Version: 1.0
Type: Function
Category: Algorithms
License: GNU General Public License
Description: This function generates the n-grams (contiguous substrings) of a word and returns an array of all unique n-grams. It takes two arguments: $word = the word, and $min_gram_length = the smallest n-gram string length you would like to produce. The longest n-grams produced are one character shorter than the word, so the whole word itself is not included. For example, ngrams('hello', 2) would produce the following values in an array: he el ll lo hel ell llo hell ello. This function is useful if you are creating a word index and would like to have the ability to search for substrings without using LIKE %word%.
/**
 * Generate the unique n-grams (contiguous substrings) of a word.
 *
 * Grams are produced for every length from $min_gram_length up to one less
 * than the length of the trimmed word, so the whole word itself is never
 * part of the result. Grams are ordered by length first, then by starting
 * position; when a gram occurs more than once only its first occurrence is
 * kept. Lengths and offsets are counted in bytes (strlen/substr).
 *
 * Example: ngrams('hello', 2) returns
 * he, el, ll, lo, hel, ell, llo, hell, ello.
 *
 * @param string $word            The word to split; surrounding whitespace is trimmed.
 * @param int    $min_gram_length Smallest gram length to produce; values below 1 are treated as 1.
 *
 * @return array Sequentially indexed list of unique n-grams; empty when the
 *               trimmed word is not longer than $min_gram_length.
 */
function ngrams($word, $min_gram_length = 2)
{
    $word = trim($word);
    $len = strlen($word);

    // A gram length below 1 could only yield empty strings, so clamp it.
    $min_gram_length = max(1, (int) $min_gram_length);

    // The longest gram is one character shorter than the word itself.
    $max_gram_length = $len - 1;

    $ngrams = array();
    for ($size = $min_gram_length; $size <= $max_gram_length; $size++) {
        // Last offset at which a gram of this size still fits inside the word.
        $last_start = $len - $size;
        for ($pos = 0; $pos <= $last_start; $pos++) {
            $ngrams[] = substr($word, $pos, $size);
        }
    }

    // array_unique() keeps the original keys, which leaves gaps after
    // duplicates are dropped; reindex so callers get a plain 0..n-1 list.
    return array_values(array_unique($ngrams));
}