Skip to main content

document_scraper_huggingfacehub

"""
document_scraper example
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import DocumentScraperGraph
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************
# ************************************************
# Define the configuration for the graph
# ************************************************
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_model_instance = HuggingFaceEndpoint(
repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

graph_config = {
"llm": {"model_instance": llm_model_instance},
}

source = """
The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian
circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature.
Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante
from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God.
Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood
through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided
by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love,
the Beatrice of his earlier poetry, through the celestial spheres of Paradise.
"""

pdf_scraper_graph = DocumentScraperGraph(
prompt="Summarize the text and find the main topics",
source=source,
config=graph_config,
)
result = pdf_scraper_graph.run()

print(json.dumps(result, indent=4))