Commit 5ea92a2

docs: update example and docs

olastor committed Jul 29, 2023
1 parent 593df19 commit 5ea92a2
Showing 3 changed files with 168 additions and 49 deletions.
115 changes: 113 additions & 2 deletions README.md
@@ -14,15 +14,126 @@ Minimalistic, customizable module for creating basic full-text search indices an
yarn add picosearch
```

or

```bash
npm install picosearch
```

## Quickstart

```javascript
const { createIndex, indexDocument, searchIndex } = require('picosearch')
const porterStemmer = require('porter-stemmer')
const { eng } = require('stopword')

; (async () => {
  // define a (custom) tokenizer for splitting a sentence into tokens
  const tokenizer = (sentence) => sentence.split(' ').map(s => s.trim())

  // define a (custom) analyzer for preprocessing individual tokens/words
  const REGEXP_PATTERN_PUNCT = new RegExp("['!\"“”#$%&\\'()\*+,\-\.\/:;<=>?@\[\\\]\^_`{|}~']", 'g')
  const analyzer = (token) => {
    let newToken = token.trim().replace(REGEXP_PATTERN_PUNCT, '').toLowerCase()

    // drop English stopwords
    if (eng.includes(newToken)) {
      return ''
    }

    return porterStemmer.stemmer(newToken)
  }

  // create a new index with a specific mapping
  const index = createIndex({
    title: 'text',
    body: 'text',
    topic: 'keyword'
  })

  // index some documents
  // raw documents are not stored in the index by default to optimize the index size
  // that's why we keep the data in a lookup mapping that can be used by the search to
  // get the documents later
  const docsLookup = {
    doc1: { title: 'Milk', body: 'A man is drinking milk.', topic: 'a' },
    doc2: { title: 'Bread', body: 'A man is eating breads.', topic: 'a' },
    doc3: { title: 'Butter', body: 'A man is eating bread and butter.', topic: 'b' }
  }
  const docsArray = Object.entries(docsLookup).map(([docId, doc]) => ({ _id: docId, ...doc }))

  docsArray.forEach((doc) => indexDocument(index, doc, analyzer))

  // make an example search on the 'body' and 'title' fields
  console.log(
    await searchIndex(
      index,
      'bread',
      {
        size: 10,
        queryFields: ['body', 'title'],
        filter: {
          topic: 'a'
        },
        getDocument: docId => docsLookup[docId]
      },
      analyzer,
      tokenizer
    )
  )
  // returns:
  // {
  //   total: 1,
  //   maxScore: 0.08530260953900706,
  //   hits: [ { _id: 'doc2', _score: 0.08530260953900706, _source: [Object] } ]
  // }
})()
```


See [examples/](https://github.com/olastor/picosearch/tree/main/examples).

## API

### `createIndex(mappings)`

[TS Doc](https://olastor.github.io/picosearch/functions/createIndex.html)

**Parameters**

- `mappings: Mappings` An object defining the fields of a document. Possible field types: `text`, `keyword`, `number`, `date`.

**Return Value**

Returns an index object to be used for querying and scoring. The raw documents are **not included**. Depending on the size of the text corpus, the size of the index can vary.
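
For illustration, a minimal sketch of a mapping that exercises all four field types (the field names here are invented for the example, not part of the library):

```javascript
const { createIndex } = require('picosearch')

// Hypothetical fields, one per supported type.
const index = createIndex({
  title: 'text',        // full-text field: tokenized and analyzed
  category: 'keyword',  // exact-match field, usable in filters
  price: 'number',
  publishedAt: 'date'
})
```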

### `indexDocument(index, document, analyzer, tokenizer)`

[TS Doc](https://olastor.github.io/picosearch/functions/indexDocument.html)

**Parameters**

- `index` The index.
- `document` The document to index.
- `analyzer` A function for analyzing an individual token.
- `tokenizer` A function for splitting a query into individual tokens.
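
As a sketch, indexing one extra document on top of the Quickstart setup (it assumes the `index`, `analyzer`, and `tokenizer` defined there; `doc4` is a made-up document):

```javascript
// Assumes `index`, `analyzer`, and `tokenizer` from the Quickstart above.
// The document follows the same shape as the Quickstart's docsLookup entries.
indexDocument(
  index,
  { _id: 'doc4', title: 'Cheese', body: 'A woman is buying cheese.', topic: 'b' },
  analyzer,
  tokenizer
)
```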

### `searchIndex(index, query, options, analyzer, tokenizer)`

[TS Doc](https://olastor.github.io/picosearch/functions/searchIndex.html)

**Parameters**

- `index` The index.
- `query` The search query.
- `options` The search options. See [here](https://olastor.github.io/picosearch/interfaces/QueryOptions.html).
- `analyzer` A function for analyzing an individual token.
- `tokenizer` A function for splitting a query into individual tokens.

**Return Value**

A search results object. See [here](https://olastor.github.io/picosearch/interfaces/SearchResults.html).
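
As a sketch of consuming the result, reusing the Quickstart's `index`, `docsLookup`, `analyzer`, and `tokenizer` (must run inside an `async` function):

```javascript
// Assumes the Quickstart setup above.
const results = await searchIndex(
  index,
  'milk',
  {
    size: 10,
    queryFields: ['body', 'title'],
    getDocument: (docId) => docsLookup[docId]
  },
  analyzer,
  tokenizer
)

console.log(results.total, results.maxScore)
for (const hit of results.hits) {
  // each hit carries _id, _score and an optional _source
  console.log(hit._id, hit._score, hit._source)
}
```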

## API Docs

See [https://olastor.github.io/picosearch/](https://olastor.github.io/picosearch/) for more details.


100 changes: 55 additions & 45 deletions examples/english.js
@@ -2,52 +2,62 @@
const { createIndex, indexDocument, searchIndex } = require('../dist')
const porterStemmer = require('porter-stemmer')
const { eng } = require('stopword')

-sentences = [
-  'A man is eating food.',
-  'A man is buying bread.',
-  'The woman is riding a bike.',
-  'A woman is playing a violin.',
-  'Two men are biking.',
-  'Two women are biking.',
-]
-
-const analyzer = {
-  stemmer: porterStemmer.stemmer,
-  lowercase: true,
-  stripPunctuation: true,
-  stopwords: eng
-}
-
-const index = createIndex({
-  title: 'text',
-  body: 'text',
-  topic: 'keyword'
-})
-const docs = [
-  { _id: 'doc1', title: 'Milk', body: 'A man is drinking milk.', topic: 'a' },
-  { _id: 'doc2', title: 'Bread', body: 'A man is eating bread.', topic: 'a' },
-  { _id: 'doc3', title: 'Butter', body: 'A man is eating bread and butter.', topic: 'b' }
-]
-docs.forEach((doc) => indexDocument(index, doc, analyzer))
-
-; (async () => {
-  console.log(JSON.stringify(await searchIndex(index, 'breet', {
-    offset: 0,
-    size: 10,
-    queryFields: {
-      body: { highlight: true },
-      title: { highlight: true }
-    },
-    fuzziness: {
-      maxError: 2,
-      prefixLength: 3
-    },
-    filter: {
-      topic: 'a'
-    },
-    getDocument: (d) => docs.find(({_id}) => _id === d)
-  }, analyzer), null, 2))
-})()
+; (async () => {
+  // define a (custom) tokenizer for splitting a sentence into tokens
+  const tokenizer = (sentence) => sentence.split(' ').map(s => s.trim())
+
+  // define a (custom) analyzer for preprocessing individual tokens/words
+  const REGEXP_PATTERN_PUNCT = new RegExp("['!\"“”#$%&\\'()\*+,\-\.\/:;<=>?@\[\\\]\^_`{|}~']", 'g')
+  const analyzer = (token) => {
+    let newToken = token.trim().replace(REGEXP_PATTERN_PUNCT, '').toLowerCase()
+
+    if (eng.includes(newToken)) {
+      return ''
+    }
+
+    return porterStemmer.stemmer(newToken)
+  }
+
+  // create a new index with a specific mapping
+  const index = createIndex({
+    title: 'text',
+    body: 'text',
+    topic: 'keyword'
+  })
+
+  // index some documents
+  // raw documents are not stored in the index by default to optimize the index size
+  // that's why we keep the data in a lookup mapping that can be used by the search to
+  // get the documents later
+  const docsLookup = {
+    doc1: { title: 'Milk', body: 'A man is drinking milk.', topic: 'a' },
+    doc2: { title: 'Bread', body: 'A man is eating breads.', topic: 'a' },
+    doc3: { title: 'Butter', body: 'A man is eating bread and butter.', topic: 'b' }
+  }
+  const docsArray = Object.entries(docsLookup).map(([docId, doc]) => ({ _id: docId, ...doc }))
+
+  docsArray.forEach((doc) => indexDocument(index, doc, analyzer))
+
+  // make an example search on the 'body' and 'title' fields
+  console.log(
+    await searchIndex(
+      index,
+      'bread',
+      {
+        size: 10,
+        queryFields: ['body', 'title'],
+        filter: {
+          topic: 'a'
+        },
+        getDocument: docId => docsLookup[docId]
+      },
+      analyzer,
+      tokenizer
+    )
+  )
+  // returns:
+  // {
+  //   total: 1,
+  //   maxScore: 0.08530260953900706,
+  //   hits: [ { _id: 'doc2', _score: 0.08530260953900706, _source: [Object] } ]
+  // }
+})()
2 changes: 0 additions & 2 deletions src/interfaces.ts
@@ -122,8 +122,6 @@
export interface SearchResultsHit {
  _id: string;
  _score: number;
  _source?: { [key: string]: any } | null;
-  highlight?: { [key: string]: string | string[] };
-  snippets?: { [key: string]: string[][] | string[] };
}

/** Data structure for a search result. */
