Module llmflex.Tools.browser_tool
Expand source code
from .tool_utils import BaseTool
from ..Embeddings.base_embeddings import BaseEmbeddingsToolkit
from ..Models.Cores.base_core import BaseLLM
from ..Rankers.base_ranker import BaseRanker
from typing import Optional, Dict, Any, List
class BrowserTool(BaseTool):
    """Tool for browsing contents via the DuckDuckGo search engine given any search query. The output will be the most relevant chunks of content found from the search engine according to the search query.
    """
    # NOTE(review): the docstrings in this class are kept close to their original
    # wording in case BaseTool surfaces them as the tool description -- confirm.

    def __init__(self, embeddings: BaseEmbeddingsToolkit, llm: Optional[BaseLLM] = None, ranker: Optional[BaseRanker] = None) -> None:
        """Initialising the tool.

        Args:
            embeddings (BaseEmbeddingsToolkit): Embeddings toolkit for the vector database.
            llm (Optional[BaseLLM], optional): LLM to count number of tokens for each chunk. Defaults to None.
            ranker (Optional[BaseRanker], optional): Reranker to rerank results. If none is given, results will not be reranked after the search on the vector database. Defaults to None.
        """
        # Imported locally, as in the original implementation.
        from ..VectorDBs.faiss_vectordb import FaissVectorDatabase

        self.embeddings = embeddings
        # Scratch vector database: filled and emptied again on every call.
        self.vdb = FaissVectorDatabase.from_documents(embeddings=self.embeddings, docs=[])
        self.llm = llm
        self.ranker = ranker
        super().__init__()

    def __call__(self, search_query: str, max_num_results: int = 3) -> Dict[str, List[str]]:
        """Entry point of the tool.

        Args:
            search_query (str): Search query to browse on DuckDuckGo.
            max_num_results (int, optional): Maximum number of relevant chunks of contents from the search results. Defaults to 3.

        Returns:
            Dict[str, List[str]]: The most relevant chunks of contents along with their respective URLs.
        """
        import gc
        from .web_search_utils import ddg_search, get_markdown, create_content_chunks
        from ..Schemas.documents import Document

        results = ddg_search(query=search_query, urls_only=False)
        # Prefer the LLM's token counter; fall back to the embeddings tokenizer.
        count_fn = self.llm.get_num_tokens if self.llm is not None else self.embeddings.tokenizer.get_num_tokens

        docs = []
        for result in results:
            content = get_markdown(result['href'], as_list=True)
            if content is None:
                # No content came back for this search result; skip it.
                continue
            docs.extend(
                Document(index=chunk, metadata=result)
                for chunk in create_content_chunks(contents=content, token_count_fn=count_fn)
            )

        if not docs:
            # Nothing to index: return an empty result instead of querying an
            # empty vector database.
            return dict(relevant_contents=[], footnote=[])

        use_ranker = self.ranker is not None
        # With a reranker, over-fetch candidates so it has a wider pool to choose from.
        top_k = max(int(max_num_results * 2), 15) if use_ranker else max_num_results
        try:
            self.vdb.add_documents(docs, split_text=False)
            hits = self.vdb.search(query=search_query, top_k=top_k, index_only=False)
        finally:
            # Always empty the scratch database, even on failure, so documents
            # from this query cannot leak into the next call.
            self.vdb.clear()

        if use_ranker:
            reranked = self.ranker.rerank(query=search_query, elements=hits, top_k=max_num_results)
            hits = [item.to_dict() for item in reranked]

        chunks = [hit['index'] for hit in hits]
        # De-duplicate URLs while keeping relevance order (a set would make the
        # footnote order non-deterministic).
        sources = list(dict.fromkeys(hit['metadata']['href'] for hit in hits))
        gc.collect()
        return dict(relevant_contents=chunks, footnote=sources)
Classes
class BrowserTool (embeddings: BaseEmbeddingsToolkit, llm: Optional[BaseLLM] = None, ranker: Optional[BaseRanker] = None)-
Tool for browsing contents via the DuckDuckGo search engine given any search query. The output will be the most relevant chunks of content found from the search engine according to the search query.
Initialising the tool.
Args
embeddings (BaseEmbeddingsToolkit): Embeddings toolkit for the vector database.
llm (Optional[BaseLLM], optional): LLM to count number of tokens for each chunk. Defaults to None.
ranker (Optional[BaseRanker], optional): Reranker to rerank results. If none is given, results will not be reranked after the search on the vector database. Defaults to None.
Expand source code
class BrowserTool(BaseTool): """Tool for browsing contents via the DuckDuckGo search engine given any search query. The output will be the most relevant chunks of content found from the search engine according to the search query. """ def __init__(self, embeddings: BaseEmbeddingsToolkit, llm: Optional[BaseLLM] = None, ranker: Optional[BaseRanker] = None) -> None: """Initialising the tool. Args: embeddings (BaseEmbeddingsToolkit): Embeddings toolkit for the vector database. llm (Optional[BaseLLM], optional): LLM to count number of tokens for each chunk. Defaults to None. ranker (Optional[BaseRanker], optional): Reranker to rerank results. If none is given, results will not be reranked after the search on the vector database. Defaults to None. """ from ..VectorDBs.faiss_vectordb import FaissVectorDatabase self.embeddings = embeddings self.vdb = FaissVectorDatabase.from_documents(embeddings=self.embeddings, docs=[]) self.llm = llm self.ranker = ranker super().__init__() def __call__(self, search_query: str, max_num_results: int = 3) -> Dict[str, List[str]]: """Entry point of the tool. Args: search_query (str): Search query to browse on DuckDuckGo. max_num_results (int, optional): Maximum number of relevant chunks of contents from the search results. Defaults to 3. Returns: Dict[str, List[str]]: The most relevant chunks of contents along with their respective URLs.
""" import gc from .web_search_utils import ddg_search, get_markdown, create_content_chunks from ..Schemas.documents import Document results = ddg_search(query=search_query, urls_only=False) contents = list(map(lambda x: get_markdown(x['href'], as_list=True), results)) count_fn = self.llm.get_num_tokens if self.llm is not None else self.embeddings.tokenizer.get_num_tokens docs = [] for i, c in enumerate(contents): if c is not None: doc = create_content_chunks(contents=c, token_count_fn=count_fn) doc = list(map(lambda x: Document(index=x, metadata=results[i]), doc)) docs.extend(doc) self.vdb.add_documents(docs, split_text=False) if self.ranker: chunks = self.vdb.search(query=search_query, top_k=max(int(max_num_results * 2), 15), index_only=False) self.vdb.clear() res = self.ranker.rerank(query=search_query, elements=chunks, top_k=max_num_results) res = list(map(lambda x: x.to_dict(), res)) chunks = list(map(lambda x: x['index'], res)) sources = list(set(map(lambda x: x['metadata']['href'], res))) else: chunks = self.vdb.search(query=search_query, top_k=max_num_results, index_only=False) self.vdb.clear() res = chunks.copy() chunks = list(map(lambda x: x['index'], res)) sources = list(set(map(lambda x: x['metadata']['href'], res))) gc.collect() output = dict(relevant_contents=chunks, footnote=sources) return outputAncestors
Inherited members