ragoon.chunks#

Classes

ChunkMetadata(uuid, chunk_uuid, chunk_number)

Metadata for a text chunk within a dataset.

DatasetChunker(dataset, max_tokens, ...[, ...])

A class to chunk text data within a dataset for processing with embeddings models.

class ragoon.chunks.ChunkMetadata(uuid: str, chunk_uuid: str, chunk_number: str)[source]#

Bases: object

Metadata for a text chunk within a dataset.

uuid#

The UUID of the original text.

Type:

str

chunk_uuid#

The UUID of the chunked text.

Type:

str

chunk_number#

The identifier of the chunk indicating its order and total number of chunks.

Type:

str

uuid: str#
chunk_uuid: str#
chunk_number: str#
__init__(uuid: str, chunk_uuid: str, chunk_number: str) → None#
class ragoon.chunks.DatasetChunker(dataset: Dataset | DatasetDict, max_tokens: int, overlap_percentage: float, column: str, model_name: str = 'bert-base-uncased', uuid_column: str | None = None, separators: List[str] = ['.', '\n'], space_after_splitters: List[str] | None = None)[source]#

Bases: object

A class to chunk text data within a dataset for processing with embeddings models.

This class splits large texts into smaller chunks based on a specified maximum token limit, while maintaining an overlap between chunks to preserve context.

datasetUnion[datasets.Dataset, datasets.DatasetDict]

The dataset to be chunked. It can be either a Dataset or a DatasetDict.

max_tokensint

The maximum number of tokens allowed in each chunk.

overlap_percentagefloat

The percentage of tokens to overlap between consecutive chunks.

columnstr

The name of the column containing the text to be chunked.

model_namestr, optional

The name of the tokenizer model to use (default is “bert-base-uncased”).

uuid_columnOptional[str], optional

The name of the column containing UUIDs for the texts. If not provided, new UUIDs will be generated.

separatorsList[str], optional

List of separators used to split the text.

space_after_splittersOptional[List[str]], optional

List of separators that require a space after splitting (default is None).

>>> from datasets import load_dataset
>>> dataset = load_dataset("louisbrulenaudet/dac6-instruct")
>>> chunker = DatasetChunker(
...     dataset['train'],
...     max_tokens=512,
...     overlap_percentage=0.5,
...     column="document",
...     model_name="intfloat/multilingual-e5-large",
...     separators=["\n", ".", "!", "?"]
... )
>>> dataset_chunked = chunker.chunk_dataset()
>>> dataset_chunked.to_list()[:3]
[{'text': 'This is a chunked text.'}, {'text': 'This is another chunked text.'}, ...]

__init__(dataset: Dataset | DatasetDict, max_tokens: int, overlap_percentage: float, column: str, model_name: str = 'bert-base-uncased', uuid_column: str | None = None, separators: List[str] = ['.', '\n'], space_after_splitters: List[str] | None = None) → None[source]#
split_text(text: str) → List[str][source]#

Splits a text into segments based on the specified separators.

Parameters:

text (str) – The text to be split.

Returns:

A list of text segments.

Return type:

List[str]

Examples

>>> chunker = DatasetChunker(dataset, 512, 0.1, 'text')
>>> chunker.split_text("This is a sentence. This is another one.")
['This is a sentence', '.', ' This is another one', '.']
create_chunks(text: str) → List[str][source]#

Creates text chunks from a given text based on the maximum tokens limit.

Parameters:

text (str) – The text to be chunked.

Returns:

A list of text chunks.

Return type:

List[str]

Raises:

ValueError – If the text cannot be chunked properly.

Examples

>>> chunker = DatasetChunker(dataset, 512, 0.1, 'text')
>>> text = "This is a very long text that needs to be chunked."
>>> chunks = chunker.create_chunks(text)
>>> len(chunks)
2
finalize_chunk(chunk_text: str, is_last: bool) → str[source]#

Finalizes the chunk text by adjusting leading/trailing separators.

Parameters:
  • chunk_text (str) – The chunk text to be finalized.

  • is_last (bool) – Indicates whether this is the last chunk.

Returns:

The finalized chunk text.

Return type:

str

Examples

>>> chunker = DatasetChunker(dataset, 512, 0.1, 'text')
>>> chunk = " This is a chunk."
>>> chunker.finalize_chunk(chunk, is_last=True)
'This is a chunk.'
chunk_dataset() → Dataset | DatasetDict[source]#

Chunks the entire dataset into smaller segments.

Returns:

The chunked dataset, with each entry split into smaller chunks.

Return type:

Union[Dataset, DatasetDict]

Examples

>>> chunker = DatasetChunker(dataset, 512, 0.1, 'text')
>>> chunked_dataset = chunker.chunk_dataset()
>>> len(chunked_dataset)
1000