🇹🇰 Token calculator
The token calculator is a utility module that splits the text scraped from a link into token-limited chunks. Given an input text, the name of the model and a tiktoken encoding name, it returns a list of text chunks, each small enough to fit within that model's token limit.
Implementation
"""
Module for truncatinh in chunks the messages
"""
from typing import List
import tiktoken
from ..helpers.models_tokens import models_tokens
def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
"""
It creates a list of strings to create max dimension tokenizable elements
Args:
text (str): The input text to be truncated into tokenizable elements.
model (str): The name of the language model to be used.
encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
Returns:
List[str]: A list of tokenizable elements created from the input text.
"""
encoding = tiktoken.get_encoding(encoding_name)
max_tokens = models_tokens[model] - 500
encoded_text = encoding.encode(text)
chunks = [encoded_text[i:i + max_tokens]
for i in range(0, len(encoded_text), max_tokens)]
result = [encoding.decode(chunk) for chunk in chunks]
return result
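The chunking itself does not depend on any ScrapeGraphAI internals. The sketch below reproduces the same logic with tiktoken alone, using a hypothetical 3500-token budget in place of models_tokens[model] - 500, so you can see how a long input is split and how large each resulting chunk is.
"""
Standalone sketch of the chunking logic (assumes only tiktoken is installed)
"""
import tiktoken

# Hypothetical budget standing in for models_tokens[model] - 500
MAX_TOKENS = 3500

encoding = tiktoken.get_encoding("cl100k_base")
text = "the quick brown fox jumps over the lazy dog " * 1000
encoded_text = encoding.encode(text)

# Slice the token ids into windows of at most MAX_TOKENS, then decode each window back to text
chunks = [encoding.decode(encoded_text[i:i + MAX_TOKENS])
          for i in range(0, len(encoded_text), MAX_TOKENS)]

print(f"{len(encoded_text)} tokens split into {len(chunks)} chunks")
for chunk in chunks:
    print(f"{len(encoding.encode(chunk))} tokens")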
Example
"""
Example for calclating the tokenizer
"""
from scrapegraphai.utils.token_calculator import truncate_text_tokens
INPUT_TEXT = "http://nba.com"
MODEL_NAME = "gpt-3.5-turbo"
ENCODING_NAME = "EMBEDDING_ENCODING"
tokenized_chunks = truncate_text_tokens(INPUT_TEXT, MODEL_NAME, ENCODING_NAME)
for i, chunk in enumerate(tokenized_chunks):
print(f"Chunk {i+1}: {chunk}")