PORTER English stemming tokenizer
Example SQL full-text searches using PORTER English stemming tokenizer
The PORTER stemming tokenizer identifies each word in the search using whitespace and/or punctuation. Then, regardless of case, the tokenizer finds all matching words by reducing each word in the search to its stem by removing trailing s, er, ing, and ed.
A search string identifies each word using whitespace and/or punctuation and may use double quotes to identify a word or a phrase. Phrases contain two or more words within double quotes and may include whitespace between words. Punctuation, such as .!>, is ignored in the search string when it is not a full-text search operator. Reserved search operators include - " ^ * ( ) NEAR NEAR/n AND OR NOT; However, * ( ) OR NOT are not supported and will return an error when used.
/* =====================================================================
* Full-text search demo using the PORTER tokenizer.
* =====================================================================
*/
/* Create the testsearch table. */
CREATE TABLE testsearch
(
name VARCHAR(30)
, description VARCHAR(500)
)
STORAGE_ATTRIBUTES 'hotAlter; huge; rowid_fld; noRecbyt_idx;'
;
/* Create full-text index on the description field of the testsearch table. Use
the PORTER English word stem tokenizer to do case insensitive matching of
common word forms */
CREATE FULLTEXT INDEX testsearch_description_porter_fts
ON testsearch( description )
TOKENIZER PORTER
UPDATE_MODE ASYNC
;
/* Insert a record into the testsearch table. */
INSERT INTO testsearch
VALUES('Mike','Americans --love-- their __BASEball__ and <FOOTball>!')
;
/* Look at the record inserted into the testsearch table. */
SELECT description FROM testsearch;
/* =====================================================================
* The following queries find the inserted record
* by searching for words and phrases in the description field:
* 'Americans --love-- their __BASEball__ and <FOOTball>!'.
*
* The PORTER tokenizer reduces the text to
* 'american love their baseball and football'.
*
* The tokenizer identifies each word using whitespace and/or punctuation.
* The tokenizer is case insensitive.
* The tokenizer reduces each word to its stem
* by removing trailing s, er, ing, and ed.
* The tokenizer finds all words matching the stem.
* These words match each other -- even when they are invalid words:
* baseball baseballs baseballing baseballer baseballed
*
* A search string identifies each word using whitespace and/or punctuation.
* A search string may use double quotes to identify a word or a phrase.
* A phrase contains two or more words within double quotes.
* A search string may include any amount of whitespace between words.
* A search string may include
* Punctuation, such as .!>, is ignored in the search string
* when it is not a full-text search operator.
* Reserved search operators include - " ^ * ( ) NEAR NEAR/n AND OR NOT
* The following search operators are unsupported: * ( ) OR NOT
* The server returns an error when you use an unsupported search operator.
* =====================================================================
*/
/* WORD SEARCH: Find records in the testsearch table where the description
column contains the word 'baseball' anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'baseball';
/* VERB STEM SEARCH: Find records in the testsearch table where the description
column contains common English conjugations of verbs, such as 'loving', 'love',
'loved', and 'loves', anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'loving';
/* NOUN STEM SEARCH: Find records in the testsearch table where the description
column contains English singular and plural forms of nouns, such as 'american'
and 'americans', anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'american';
/* CASE INSENSITIVE SEARCH: Find records in the testsearch table where the
description column contains common forms of the word 'baseBALLs', such as
'baseball' and 'Baseballs' anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'baseBALLs';
/* FIRST WORD SEARCH: Find records in the testsearch table where the
description column contains the case-insensitive word 'americans' as the first
word in the description field. */
SELECT description FROM testsearch WHERE description MATCH '^americans';
/* FIRST PHRASE SEARCH: Find records in the testsearch table where the
description column contains the phrase "americans love" at the begining of the
text in the description field. */
SELECT description FROM testsearch WHERE description MATCH ' ^"americans love"
';
/* MULTI-WORD SEARCH: Find records in the testsearch table where the
description column contains the words 'baseball' and 'love' in any order
anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'baseball love';
/* MULTI-WORD SEARCH ACROSS WHITESPACE: Find records in the testsearch table
where the description column contains the words 'baseball' and 'love' in any
order anywhere in the description field while ignoring case and whitespace. */
SELECT description FROM testsearch WHERE description MATCH ' Baseball >>>
LOVE! ';
/* PHRASE SEARCH: Find records in the testsearch table where the description
column contains the case-insensitive phrase "love their" anywhere in the
description field. The words 'love" and 'their" must follow each other with no
intervening words in the description field. */
SELECT description FROM testsearch WHERE description MATCH ' "love their" ';
/* PHRASE SEARCH ACROSS WHITESPACE: Find records in the testsearch table where
the description column contains the case-insensitive phrase "LOVE their"
anywhere in the description field. The case-insensitive words 'LOVE" and
'theIR" must follow each other with no intervening words, but any amount of
whitespace and punctuation may occur between these words in the description
field. */
SELECT description FROM testsearch WHERE description MATCH ' "LOVE theIR" ';
/* STEMMED MULTI-WORD SEARCH: Find records in the testsearch table where the
description column contains several words anywhere in the description field,
such as 'baseBALLs' and 'american'. The PORTER tokenizer matches each word that
has the same English stemming. */
SELECT description FROM testsearch WHERE description MATCH 'Loved their
american baseBALLs.';
/* STEMMED PHRASE SEARCH: Find records in the testsearch table where the
description column contains common forms of each word in the phrase "and
football" anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH ' "and footballs" ';
/* STEMMED WORD SEARCH ACROSS INCORRECT WORD FORMS: Find records in the
testsearch table where the description column contains correct and incorrect
forms of words, such as 'baseballing', 'baseballer', 'baseballed'. The PORTER
stemmer applies its rules to every word -- even when the rule is incorrect in
English. */
SELECT description FROM testsearch WHERE description MATCH ' american americans
americaning americaner americaned baseball baseballs baseballing baseballer
baseballed their theirs theiring theired "and" "ands" "anding" "anded" ';
/* STEMMED FIRST WORD SEARCH: Find records in the testsearch table where the
description column contain common forms of the word 'americaning' as the first
word in the description field. */
SELECT description FROM testsearch WHERE description MATCH '^americaning';
/* STEMMED FIRST PHRASE SEARCH: Find records in the testsearch table where the
description column contains common forms of the words in the phrase "american
loving" at the begining of the text in the description field. */
SELECT description FROM testsearch WHERE description MATCH ' ^"american
loving" ';
/* NEAR SEARCH: Find records in the testsearch table where the description
column contains the words 'Americans' and 'baseball' with no more than ten
words between each other anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'Americans NEAR
football';
/* NEAR PHRASE SEARCH: Find records in the testsearch table where the
description column contains the words 'their' and 'baseballs' with no words
between each other. This is the same as using a phrase search. */
SELECT description FROM testsearch WHERE description MATCH 'their NEAR/0
baseball';
/* NEAR N SEARCH: Find records in the testsearch table where the description
column contains the words 'love' and 'baseball' with no more than 1 word
between each other. This pattern may occur anywhere in the description field
with any amount of whitespace and punctuation between the words. */
SELECT description FROM testsearch WHERE description MATCH ' baseball NEAR/1
football ';
/* NOT WORD SEARCH: Find records in the testsearch table where the description
column contains the words 'love' and 'baseball' but not 'karate'. NOTE: The
minus operator is the NOT operator. It must be adjacent to the word with no
intervening whitespace. You must also include a term in the search phrase
before the NOT operator. */
SELECT description FROM testsearch WHERE description MATCH 'love baseball
-karate';
/* NOT PHRASE SEARCH: Find records in the testsearch table where the
description column contains the word 'baseball' but not the phrase "hate
karate" and not the word 'soccer'. NOTE: The minus operator is the NOT
operator. It must be adjacent to the word with no intervening whitespace. You
must also include a term in the search phrase before the NOT operator. */
SELECT description FROM testsearch WHERE description MATCH 'baseball -"hate
karate" -soccer';
/* STEMMED NEAR SEARCH: Find records in the testsearch table where the
description column contains case-insensitive common forms of the words
'American' and 'baseball' with no more than ten words between each other
anywhere in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'americaner NEAR
baseballs ';
/* STEMMED PHRASE SEARCH: Find records in the testsearch table where the
description column contains case-insensitive common forms of the words 'their'
and 'baseballs' with no words between each other anywhere in the description
field. */
SELECT description FROM testsearch WHERE description MATCH 'their NEAR/0
baseBALLS ';
/* STEMMED NEAR N SEARCH: Find records in the testsearch table where the
description column contains case-insensitive common forms of the words
'baseball' and 'football' with no more than 1 word between each other anywhere
in the description field. */
SELECT description FROM testsearch WHERE description MATCH 'baseballing NEAR/1
footballed ';
/* STEMMED NOT WORD SEARCH: Find records in the testsearch table where the
description column contains case-insensitive common forms of the words 'love'
and 'baseball' but not 'karate'. NOTE: The minus operator must be adjacent to
the word or phrase with no intervening whitespace. You must also have a term in
the search phrase before the minus - operator. */
SELECT description FROM testsearch WHERE description MATCH 'loves baseballing
-karate';
/* STEMMED NOT PHRASE SEARCH: Find records in the testsearch table where the
description column contains case-insensitive common forms of the word
'baseball' but not the phrase "hate karate". */
SELECT description FROM testsearch WHERE description MATCH 'loving baseballs
-"hating karate" ';
/* STEMMED NOT PHRASE SEARCH: Find records in the testsearch table where the
description column contains case-insensitive common forms of the word 'love'
but not case-insensitive common forms of the words in the phrase "Americans
football". */
SELECT description FROM testsearch WHERE description MATCH 'Loved -"ameRICAN
footBALLs" ';
/* Remove the testsearch table. */
DROP TABLE testsearch;