narrow_down.storage module#

Base classes and interfaces for storage.

exception narrow_down.storage.TooLowStorageLevel[source]#

Bases: Exception

Raised if a feature is used for which a higher storage level is needed.

class narrow_down.storage.StorageLevel(value)[source]#

Bases: Flag

Detail level of document persistence.

Minimal = 1#

Minimal storage level. Only store the necessary data to perform the search.

Fingerprint = 2#

In addition to Minimal, also store the fingerprint, e.g. the Minhashes.

Document = 4#

Store the whole inserted document internally.

Full = 7#

Store everything.

class narrow_down.storage.Fingerprint#

Type representing the result of a minhashing operation.

alias of ndarray[Any, dtype[uint32]]

class narrow_down.storage.StoredDocument(id_=None, document=None, exact_part=None, fingerprint=None, data=None)[source]#

Bases: object

Data object combining all possible fields of a document stored.

Parameters:
  • id_ (int | None) –

  • document (str | None) –

  • exact_part (str | None) –

  • fingerprint (Fingerprint | None) –

  • data (str | None) –

id_: int | None = None#

Identifier used to distinguish the document from an identical one.

document: str | None = None#

The actual content to use for fuzzy matching, e.g. a full unprocessed sentence.

exact_part: str | None = None#

A string which should be matched exactly.

fingerprint: Fingerprint | None = None#

A fuzzy fingerprint of the document, e.g. a Minhash.

data: str | None = None#

Payload to persist together with the document in the internal data structures.

serialize(storage_level)[source]#

Serialize a document to bytes.

Parameters:

storage_level (StorageLevel) –

Return type:

bytes

static deserialize(doc, id_)[source]#

Deserialize a document from bytes.

Parameters:
  • doc (bytes) –

  • id_ (int) –

Return type:

StoredDocument

without(*attributes)[source]#

Create a copy with the specified attributes left out.

Parameters:

attributes (str) – The names of the attributes to leave empty

Returns:

A copy of the StoredDocument with all the attributes specified in attributes left out, so that they have their default value (None).

Return type:

StoredDocument

class narrow_down.storage.StorageBackend[source]#

Bases: ABC

Storage backend for a SimilarityStore.

async initialize()[source]#

Initialize the database.

Returns:

self

Return type:

StorageBackend

abstract async insert_setting(key, value)[source]#

Store a setting as key-value pair.

Parameters:
  • key (str) –

  • value (str) –

abstract async query_setting(key)[source]#

Query a setting with the given key.

Parameters:

key (str) – The identifier of the setting

Returns:

A string with the value. If the key does not exist or the storage is uninitialized, None is returned.

Return type:

str | None

abstract async insert_document(document, document_id=None)[source]#

Add the data of a document to the storage and return its ID.

Parameters:
  • document (bytes) –

  • document_id (int | None) –

Return type:

int

abstract async query_document(document_id)[source]#

Get the data belonging to a document.

Parameters:

document_id (int) – Key under which the data is stored.

Returns:

The document value for the given ID.

Raises:

KeyError – If no document with the given ID is stored.

Return type:

bytes

async query_documents(document_ids)[source]#

Get the data belonging to multiple documents.

Parameters:

document_ids (List[int]) – Keys under which the data is stored.

Returns:

The list of document values for the given IDs.

Raises:

KeyError – If no document was found for at least one of the ids.

Return type:

List[bytes]

abstract async remove_document(document_id)[source]#

Remove a document given by ID from the list of documents.

Parameters:

document_id (int) –

abstract async add_document_to_bucket(bucket_id, document_hash, document_id)[source]#

Link a document to a bucket.

Parameters:
  • bucket_id (int) –

  • document_hash (int) –

  • document_id (int) –

abstract async query_ids_from_bucket(bucket_id, document_hash)[source]#

Get all document IDs stored in a bucket for a certain hash value.

Parameters:
  • bucket_id (int) –

  • document_hash (int) –

Return type:

Iterable[int]

abstract async remove_id_from_bucket(bucket_id, document_hash, document_id)[source]#

Remove a document from a bucket.

Parameters:
  • bucket_id (int) –

  • document_hash (int) –

  • document_id (int) –

class narrow_down.storage.InMemoryStore[source]#

Bases: StorageBackend

Rust implementation of InMemoryStore.

__init__()[source]#

Create a new InMemoryStore.

serialize()[source]#

Serialize the data into the MessagePack format so that it can be persisted somewhere.

Return type:

bytes

to_file(file_path)[source]#

Serialize the data into a messagepack file with the given path.

Parameters:

file_path (str) –

classmethod deserialize(msgpack)[source]#

Deserialize an InMemoryStore object from messagepack.

Parameters:

msgpack (bytes) –

Return type:

InMemoryStore

classmethod from_file(file_path)[source]#

Deserialize an InMemoryStore object from the given messagepack file.

Parameters:

file_path (str) –

Return type:

InMemoryStore

async insert_setting(key, value)[source]#

Store a setting as key-value pair.

Parameters:
  • key (str) –

  • value (str) –

async query_setting(key)[source]#

Query a setting with the given key.

Parameters:

key (str) –

Return type:

str | None

async insert_document(document, document_id=None)[source]#

Add the data of a document to the storage and return its ID.

Parameters:
  • document (bytes) –

  • document_id (int | None) –

Return type:

int

async query_document(document_id)[source]#

Get the data belonging to a document.

Parameters:

document_id (int) – The id of the document. This ID is created and returned by the insert_document method.

Returns:

The document stored under the key document_id as bytes object.

Raises:

KeyError – If the document is not stored.

Return type:

bytes

async remove_document(document_id)[source]#

Remove a document given by ID from the list of documents.

Parameters:

document_id (int) –

async add_document_to_bucket(bucket_id, document_hash, document_id)[source]#

Link a document to a bucket.

Parameters:
  • bucket_id (int) –

  • document_hash (int) –

  • document_id (int) –

async query_ids_from_bucket(bucket_id, document_hash)[source]#

Get all document IDs stored in a bucket for a certain hash value.

Parameters:
  • bucket_id (int) –

  • document_hash (int) –

Return type:

Iterable[int]

async remove_id_from_bucket(bucket_id, document_hash, document_id)[source]#

Remove a document from a bucket.

Parameters:
  • bucket_id (int) –

  • document_hash (int) –

  • document_id (int) –