Product Documentation

SQL Reference Guide

Previous Topic

Next Topic

SQL full-text search examples using PORTER English stemming tokenizer

The PORTER stemming tokenizer identifies each word in the search using whitespace and punctuation. Then, regardless of case, the tokenizer finds all matching words by reducing each word in the search to its stem by removing trailing s, er, ing, and ed.

A search string identifies each word using whitespace and/or punctuation and may use double quotes to identify a word or a phrase. Phrases contain two or more words within double quotes and may include whitespace between words. Punctuation, such as .!>, is ignored in the search string when it is not a full-text search operator. Reserved search operators include - " ^ * ( ) NEAR NEAR/n AND OR NOT; However, * ( ) OR NOT are not supported and will return an error when used.

/* =====================================================================

* Full-text search demo using the PORTER tokenizer.

* =====================================================================

*/

/* Create the testsearch table. */

CREATE TABLE testsearch

(

name VARCHAR(30)

, description VARCHAR(500)

)

STORAGE_ATTRIBUTES 'hotAlter; huge; rowid_fld; noRecbyt_idx;'

;

/* Create full-text index on the description field of the testsearch table. Use the PORTER English word stem tokenizer to do case insensitive matching of common word forms */

CREATE FULLTEXT INDEX testsearch_description_porter_fts

ON testsearch( description )

TOKENIZER PORTER

UPDATE_MODE ASYNC

;

/* Insert a record into the testsearch table. */

INSERT INTO testsearch

VALUES('Mike','Americans --love-- their __BASEball__ and <FOOTball>!')

;

/* Look at the record inserted into the testsearch table. */

SELECT description FROM testsearch;

/* =====================================================================

* The following queries find the inserted record

* by searching for words and phrases in the description field:

* 'Americans --love-- their __BASEball__ and <FOOTball>!'.

*

* The PORTER tokenizer reduces the text to

* 'american love their baseball and football'.

*

* The tokenizer identifies each word using whitespace and/or punctuation.

* The tokenizer is case insensitive.

* The tokenizer reduces each word to its stem

* by removing trailing s, er, ing, and ed.

* The tokenizer finds all words matching the stem.

* These words match each other -- even when they are invalid words:

* baseball baseballs baseballing baseballer baseballed

*

* A search string identifies each word using whitespace and/or punctuation.

* A search string may use double quotes to identify a word or a phrase.

* A phrase contains two or more words within double quotes.

* A search string may include any amount of whitespace between words.

* A search string may include

* Punctuation, such as .!>, is ignored in the search string

* when it is not a full-text search operator.

* Reserved search operators include - " ^ * ( ) NEAR NEAR/n AND OR NOT

* The following search operators are unsupported: * ( ) OR NOT

* The server returns an error when you use an unsupported search operator.

* =====================================================================

*/

/* WORD SEARCH: Find records in the testsearch table where the description column contains the word 'baseball' anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'baseball';

/* VERB STEM SEARCH: Find records in the testsearch table where the description column contains common English conjugations of verbs, such as 'loving', 'love', 'loved', and 'loves', anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'loving';

/* NOUN STEM SEARCH: Find records in the testsearch table where the description column contains English singular and plural forms of nouns, such as 'american' and 'americans', anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'american';

/* CASE INSENSITIVE SEARCH: Find records in the testsearch table where the description column contains common forms of the word 'baseBALLs', such as 'baseball' and 'Baseballs' anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'baseBALLs';

/* FIRST WORD SEARCH: Find records in the testsearch table where the description column contains the case-insensitive word 'americans' as the first word in the description field. */

SELECT description FROM testsearch WHERE description MATCH '^americans';

/* FIRST PHRASE SEARCH: Find records in the testsearch table where the description column contains the phrase "americans love" at the begining of the text in the description field. */

SELECT description FROM testsearch WHERE description MATCH ' ^"americans love" ';

/* MULTI-WORD SEARCH: Find records in the testsearch table where the description column contains the words 'baseball' and 'love' in any order anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'baseball love';

/* MULTI-WORD SEARCH ACROSS WHITESPACE: Find records in the testsearch table where the description column contains the words 'baseball' and 'love' in any order anywhere in the description field while ignoring case and whitespace. */

SELECT description FROM testsearch WHERE description MATCH ' Baseball >>> LOVE! ';

/* PHRASE SEARCH: Find records in the testsearch table where the description column contains the case-insensitive phrase "love their" anywhere in the description field. The words 'love" and 'their" must follow each other with no intervening words in the description field. */

SELECT description FROM testsearch WHERE description MATCH ' "love their" ';

/* PHRASE SEARCH ACROSS WHITESPACE: Find records in the testsearch table where the description column contains the case-insensitive phrase "LOVE their" anywhere in the description field. The case-insensitive words 'LOVE" and 'theIR" must follow each other with no intervening words, but any amount of whitespace and punctuation may occur between these words in the description field. */

SELECT description FROM testsearch WHERE description MATCH ' "LOVE theIR" ';

/* STEMMED MULTI-WORD SEARCH: Find records in the testsearch table where the description column contains several words anywhere in the description field, such as 'baseBALLs' and 'american'. The PORTER tokenizer matches each word that has the same English stemming. */

SELECT description FROM testsearch WHERE description MATCH 'Loved their american baseBALLs.';

/* STEMMED PHRASE SEARCH: Find records in the testsearch table where the description column contains common forms of each word in the phrase "and football" anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH ' "and footballs" ';

/* STEMMED WORD SEARCH ACROSS INCORRECT WORD FORMS: Find records in the testsearch table where the description column contains correct and incorrect forms of words, such as 'baseballing', 'baseballer', 'baseballed'. The PORTER stemmer applies its rules to every word -- even when the rule is incorrect in English. */

SELECT description FROM testsearch WHERE description MATCH ' american americans americaning americaner americaned baseball baseballs baseballing baseballer baseballed their theirs theiring theired "and" "ands" "anding" "anded" ';

/* STEMMED FIRST WORD SEARCH: Find records in the testsearch table where the description column contain common forms of the word 'americaning' as the first word in the description field. */

SELECT description FROM testsearch WHERE description MATCH '^americaning';

/* STEMMED FIRST PHRASE SEARCH: Find records in the testsearch table where the description column contains common forms of the words in the phrase "american loving" at the begining of the text in the description field. */

SELECT description FROM testsearch WHERE description MATCH ' ^"american loving" ';

/* NEAR SEARCH: Find records in the testsearch table where the description column contains the words 'Americans' and 'baseball' with no more than ten words between each other anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'Americans NEAR football';

/* NEAR PHRASE SEARCH: Find records in the testsearch table where the description column contains the words 'their' and 'baseballs' with no words between each other. This is the same as using a phrase search. */

SELECT description FROM testsearch WHERE description MATCH 'their NEAR/0 baseball';

/* NEAR N SEARCH: Find records in the testsearch table where the description column contains the words 'love' and 'baseball' with no more than 1 word between each other. This pattern may occur anywhere in the description field with any amount of whitespace and punctuation between the words. */

SELECT description FROM testsearch WHERE description MATCH ' baseball NEAR/1 football ';

/* NOT WORD SEARCH: Find records in the testsearch table where the description column contains the words 'love' and 'baseball' but not 'karate'. NOTE: The minus operator is the NOT operator. It must be adjacent to the word with no intervening whitespace. You must also include a term in the search phrase before the NOT operator. */

SELECT description FROM testsearch WHERE description MATCH 'love baseball -karate';

/* NOT PHRASE SEARCH: Find records in the testsearch table where the description column contains the word 'baseball' but not the phrase "hate karate" and not the word 'soccer'. NOTE: The minus operator is the NOT operator. It must be adjacent to the word with no intervening whitespace. You must also include a term in the search phrase before the NOT operator. */

SELECT description FROM testsearch WHERE description MATCH 'baseball -"hate karate" -soccer';

/* STEMMED NEAR SEARCH: Find records in the testsearch table where the description column contains case-insensitive common forms of the words 'American' and 'baseball' with no more than ten words between each other anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'americaner NEAR baseballs ';

/* STEMMED PHRASE SEARCH: Find records in the testsearch table where the description column contains case-insensitive common forms of the words 'their' and 'baseballs' with no words between each other anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'their NEAR/0 baseBALLS ';

/* STEMMED NEAR N SEARCH: Find records in the testsearch table where the description column contains case-insensitive common forms of the words 'baseball' and 'football' with no more than 1 word between each other anywhere in the description field. */

SELECT description FROM testsearch WHERE description MATCH 'baseballing NEAR/1 footballed ';

/* STEMMED NOT WORD SEARCH: Find records in the testsearch table where the description column contains case-insensitive common forms of the words 'love' and 'baseball' but not 'karate'. NOTE: The minus operator must be adjacent to the word or phrase with no intervening whitespace. You must also have a term in the search phrase before the minus - operator. */

SELECT description FROM testsearch WHERE description MATCH 'loves baseballing -karate';

/* STEMMED NOT PHRASE SEARCH: Find records in the testsearch table where the description column contains case-insensitive common forms of the word 'baseball' but not the phrase "hate karate". */

SELECT description FROM testsearch WHERE description MATCH 'loving baseballs -"hating karate" ';

/* STEMMED NOT PHRASE SEARCH: Find records in the testsearch table where the description column contains case-insensitive common forms of the word 'love' but not case-insensitive common forms of the words in the phrase "Americans football". */

SELECT description FROM testsearch WHERE description MATCH 'Loved -"ameRICAN footBALLs" ';

/* Remove the testsearch table. */

DROP TABLE testsearch;

TOCIndex