Commit 5ea92a2

docs: update example and docs

olastor committed Jul 29, 2023
1 parent 593df19 commit 5ea92a2
Showing 3 changed files with 168 additions and 49 deletions.
115 changes: 113 additions & 2 deletions README.md
@@ -14,15 +14,126 @@ Minimalistic, customizable module for creating basic full-text search indices an
yarn add picosearch
```

or

```bash
npm install picosearch
```

## Quickstart

```javascript
const { createIndex, indexDocument, searchIndex } = require('picosearch')
const porterStemmer = require('porter-stemmer')
const { eng } = require('stopword')

; (async () => {
  // define a (custom) tokenizer for splitting a sentence into tokens
  const tokenizer = (sentence) => sentence.split(' ').map(s => s.trim())

  // define a (custom) analyzer for preprocessing individual tokens/words
  const REGEXP_PATTERN_PUNCT = new RegExp("['!\"“”#$%&\\'()\*+,\-\.\/:;<=>?@\[\\\]\^_`{|}~']", 'g')
  const analyzer = (token) => {
    let newToken = token.trim().replace(REGEXP_PATTERN_PUNCT, '').toLowerCase()

    // drop English stopwords
    if (eng.includes(newToken)) {
      return ''
    }

    return porterStemmer.stemmer(newToken)
  }

  // create a new index with a specific mapping
  const index = createIndex({
    title: 'text',
    body: 'text',
    topic: 'keyword'
  })

  // index some documents
  // raw documents are not stored in the index by default to optimize the index size
  // that's why we keep the data in a lookup mapping that can be used by the search to
  // get the documents later
  const docsLookup = {
    doc1: { title: 'Milk', body: 'A man is drinking milk.', topic: 'a' },
    doc2: { title: 'Bread', body: 'A man is eating breads.', topic: 'a' },
    doc3: { title: 'Butter', body: 'A man is eating bread and butter.', topic: 'b' }
  }
  const docsArray = Object.entries(docsLookup).map(([docId, doc]) => ({ _id: docId, ...doc }))

  docsArray.forEach((doc) => indexDocument(index, doc, analyzer))

  // make an example search on the 'body' and 'title' fields
  console.log(
    await searchIndex(
      index,
      'bread',
      {
        size: 10,
        queryFields: ['body', 'title'],
        filter: {
          topic: 'a'
        },
        getDocument: docId => docsLookup[docId]
      },
      analyzer,
      tokenizer
    )
  )
  // returns:
  // {
  //   total: 1,
  //   maxScore: 0.08530260953900706,
  //   hits: [ { _id: 'doc2', _score: 0.08530260953900706, _source: [Object] } ]
  // }
})()
```


See [examples/](https://github.com/olastor/picosearch/tree/main/examples).

## API

### `createIndex(mappings)`

[TS Doc](https://olastor.github.io/picosearch/functions/createIndex.html)

**Parameters**

- `mappings: Mappings` An object defining the fields of a document. Possible field types: `text`, `keyword`, `number`, `date`.

**Return Value**

Returns an index object to be used for querying and scoring. The raw documents are **not included**. Depending on the size of the text corpus, the size of the index can vary.
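
For illustration, a minimal sketch of a mapping that exercises all four field types (the field names here are invented for the example, not part of the library):

```javascript
const { createIndex } = require('picosearch')

// Hypothetical fields, one per supported type.
const index = createIndex({
  title: 'text',        // full-text field: tokenized and analyzed
  category: 'keyword',  // exact-match field, usable in filters
  price: 'number',
  publishedAt: 'date'
})
```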

### `indexDocument(index, document, analyzer, tokenizer)`

[TS Doc](https://olastor.github.io/picosearch/functions/indexDocument.html)

**Parameters**

- `index` The index.
- `document` The document to index.
- `analyzer` A function for analyzing an individual token.
- `tokenizer` A function for splitting a query into individual tokens.
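
As a sketch, indexing one extra document on top of the Quickstart setup (it assumes the `index`, `analyzer`, and `tokenizer` defined there; `doc4` is a made-up document):

```javascript
// Assumes `index`, `analyzer`, and `tokenizer` from the Quickstart above.
// The document follows the same shape as the Quickstart's docsLookup entries.
indexDocument(
  index,
  { _id: 'doc4', title: 'Cheese', body: 'A woman is buying cheese.', topic: 'b' },
  analyzer,
  tokenizer
)
```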

### `searchIndex(index, query, options, analyzer, tokenizer)`

[TS Doc](https://olastor.github.io/picosearch/functions/searchIndex.html)

**Parameters**

- `index` The index.
- `query` The search query.
- `options` The search options. See [here](https://olastor.github.io/picosearch/interfaces/QueryOptions.html).
- `analyzer` A function for analyzing an individual token.
- `tokenizer` A function for splitting a query into individual tokens.

**Return Value**

A search results object. See [here](https://olastor.github.io/picosearch/interfaces/SearchResults.html).
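
As a sketch of consuming the result, reusing the Quickstart's `index`, `docsLookup`, `analyzer`, and `tokenizer` (must run inside an `async` function):

```javascript
// Assumes the Quickstart setup above.
const results = await searchIndex(
  index,
  'milk',
  {
    size: 10,
    queryFields: ['body', 'title'],
    getDocument: (docId) => docsLookup[docId]
  },
  analyzer,
  tokenizer
)

console.log(results.total, results.maxScore)
for (const hit of results.hits) {
  // each hit carries _id, _score and an optional _source
  console.log(hit._id, hit._score, hit._source)
}
```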

## API Docs

See [https://olastor.github.io/picosearch/](https://olastor.github.io/picosearch/) for more details.


100 changes: 55 additions & 45 deletions examples/english.js
@@ -2,52 +2,62 @@
const { createIndex, indexDocument, searchIndex } = require('../dist')
const porterStemmer = require('porter-stemmer')
const { eng } = require('stopword')

-sentences = [
-  'A man is eating food.',
-  'A man is buying bread.',
-  'The woman is riding a bike.',
-  'A woman is playing a violin.',
-  'Two men are biking.',
-  'Two women are biking.',
-]
-
-const analyzer = {
-  stemmer: porterStemmer.stemmer,
-  lowercase: true,
-  stripPunctuation: true,
-  stopwords: eng
-}
-
-const index = createIndex({
-  title: 'text',
-  body: 'text',
-  topic: 'keyword'
-})
-const docs = [
-  { _id: 'doc1', title: 'Milk', body: 'A man is drinking milk.', topic: 'a' },
-  { _id: 'doc2', title: 'Bread', body: 'A man is eating bread.', topic: 'a' },
-  { _id: 'doc3', title: 'Butter', body: 'A man is eating bread and butter.', topic: 'b' }
-]
-docs.forEach((doc) => indexDocument(index, doc, analyzer))
-
-; (async () => {
-  console.log(JSON.stringify(await searchIndex(index, 'breet', {
-    offset: 0,
-    size: 10,
-    queryFields: {
-      body: { highlight: true },
-      title: { highlight: true }
-    },
-    fuzziness: {
-      maxError: 2,
-      prefixLength: 3
-    },
-    filter: {
-      topic: 'a'
-    },
-    getDocument: (d) => docs.find(({_id}) => _id === d)
-  }, analyzer), null, 2))
-})()
+; (async () => {
+  // define a (custom) tokenizer for splitting a sentence into tokens
+  const tokenizer = (sentence) => sentence.split(' ').map(s => s.trim())
+
+  // define a (custom) analyzer for preprocessing individual tokens/words
+  const REGEXP_PATTERN_PUNCT = new RegExp("['!\"“”#$%&\\'()\*+,\-\.\/:;<=>?@\[\\\]\^_`{|}~']", 'g')
+  const analyzer = (token) => {
+    let newToken = token.trim().replace(REGEXP_PATTERN_PUNCT, '').toLowerCase()
+
+    if (eng.includes(newToken)) {
+      return ''
+    }
+
+    return porterStemmer.stemmer(newToken)
+  }
+
+  // create a new index with a specific mapping
+  const index = createIndex({
+    title: 'text',
+    body: 'text',
+    topic: 'keyword'
+  })
+
+  // index some documents
+  // raw documents are not stored in the index by default to optimize the index size
+  // that's why we keep the data in a lookup mapping that can be used by the search to
+  // get the documents later
+  const docsLookup = {
+    doc1: { title: 'Milk', body: 'A man is drinking milk.', topic: 'a' },
+    doc2: { title: 'Bread', body: 'A man is eating breads.', topic: 'a' },
+    doc3: { title: 'Butter', body: 'A man is eating bread and butter.', topic: 'b' }
+  }
+  const docsArray = Object.entries(docsLookup).map(([docId, doc]) => ({ _id: docId, ...doc }))
+
+  docsArray.forEach((doc) => indexDocument(index, doc, analyzer))
+
+  // make an example search on the 'body' and 'title' fields
+  console.log(
+    await searchIndex(
+      index,
+      'bread',
+      {
+        size: 10,
+        queryFields: ['body', 'title'],
+        filter: {
+          topic: 'a'
+        },
+        getDocument: docId => docsLookup[docId]
+      },
+      analyzer,
+      tokenizer
+    )
+  )
+  // returns:
+  // {
+  //   total: 1,
+  //   maxScore: 0.08530260953900706,
+  //   hits: [ { _id: 'doc2', _score: 0.08530260953900706, _source: [Object] } ]
+  // }
+})()
2 changes: 0 additions & 2 deletions src/interfaces.ts
@@ -122,8 +122,6 @@
export interface SearchResultsHit {
  _id: string;
  _score: number;
  _source?: { [key: string]: any } | null;
-  highlight?: { [key: string]: string | string[] };
-  snippets?: { [key: string]: string[][] | string[] };
}

/** Data structure for a search result. */
