# Source code for ragoon.web_rag

# -*- coding: utf-8 -*-
# Copyright (c) Louis Brulé Naudet. All Rights Reserved.
# This software may be used and distributed according to the terms of License Agreement.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    Type,
    Tuple,
    Union,
    Mapping,
    TypeVar,
    Callable,
    Optional,
    Sequence,
)

from ragoon._retrieval import Retriever
from ragoon._scrape import WebScraper
from ragoon._search import GoogleSearch


class WebRAG:
    """
    Retrieval-augmented generation over live Google search results.

    Given a natural-language query, this class asks a completion model to
    distill it into a Google search query, fetches the top results via the
    Google Custom Search API, scrapes the result pages in parallel, and
    returns the extracted page content for use as LLM context.

    Parameters
    ----------
    google_api_key : str
        The API key for Google services.
    google_cx : str
        The custom search engine ID for Google Custom Search.
    completion_client : object
        The API client instance for the completion service
        (e.g., a Groq or OpenAI client) — not a string.
    user_agent : str, optional
        The user agent string to be used in web requests.
        Default is a Chrome user agent.

    Attributes
    ----------
    web_search : GoogleSearch
        An instance of the GoogleSearch class for Google searches.
    retriever : Retriever
        An instance of the Retriever class for completion calls.
    web_scraper : WebScraper
        An instance of the WebScraper class for web scraping.

    Examples
    --------
    >>> ragoon = WebRAG(
    ...     google_api_key="your_google_api_key",
    ...     google_cx="your_google_cx",
    ...     completion_client=Groq(api_key="your_groq_api_key"),
    ... )
    >>> results = ragoon.search(
    ...     query="I want to do a left join in python polars",
    ...     completion_model="Llama3-70b-8192",
    ...     max_tokens=512,
    ...     temperature=1,
    ... )
    """

    def __init__(
        self,
        google_api_key: str,
        google_cx: str,
        completion_client,
        user_agent: Optional[str] = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        ),
    ) -> None:
        """
        Initialize the search, retrieval, and scraping components.

        Parameters
        ----------
        google_api_key : str
            The API key for Google services.
        google_cx : str
            The custom search engine ID for Google Custom Search.
        completion_client : object
            The API client instance for the completion service.
        user_agent : str, optional
            The user agent string used by the web scraper.
        """
        self.web_search = GoogleSearch(
            developer_key=google_api_key,
            cx=google_cx,
        )
        self.retriever = Retriever(
            client=completion_client,
        )
        # Bug fix: the original constructor ignored the `user_agent`
        # parameter and always passed a hard-coded duplicate of the
        # default UA string to WebScraper.
        self.web_scraper = WebScraper(
            user_agent=user_agent,
        )

    def search(
        self,
        query: str,
        completion_model: str,
        system_prompt: Optional[str] = """
        Given the user's input query, generate a concise and relevant Google search query that directly addresses the main intent of the user's question. The search query must be specifically tailored to retrieve results that can significantly enhance the context for a subsequent dialogue with an LLM. This approach will facilitate few-shot learning by providing rich, specific, and contextually relevant information. Please ensure that the response is well-formed and format it as a JSON object with a key named 'search_query'. This structured approach will help in assimilating the fetched results into an enhanced conversational model, contributing to a more nuanced and informed interaction.
        """,
        *args,
        **kwargs,
    ):
        """
        Search for information and return scraped page content.

        The completion model first rewrites ``query`` into a Google search
        query (expected back as JSON with a ``search_query`` key); the top
        result URLs are then scraped in parallel.

        Parameters
        ----------
        query : str
            The user's natural-language search query.
        completion_model : str
            The name or identifier of the completion model to be used.
        system_prompt : str, optional
            The system prompt instructing the model how to rewrite the
            query.
        *args
            Additional positional arguments forwarded to the completion
            method.
        **kwargs
            Additional keyword arguments forwarded to the completion
            method (e.g. ``max_tokens``, ``temperature``).

        Returns
        -------
        results : list
            The scraped content of the ``<main>`` element of each result
            page.
        """
        # Bug fix: the original signature declared `**kargs` (typo) and
        # silently dropped it; keyword arguments are now forwarded to the
        # completion call as the docstring promises. `*args` is unpacked
        # before the keyword arguments for clarity.
        search_query = self.retriever.completion(
            *args,
            model=completion_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query},
            ],
            **kwargs,
        )["search_query"]

        search_results = self.web_search.search(
            query=search_query,
        )

        list_of_urls = [result.get("link") for result in search_results]

        return self.web_scraper.parallel_scrape(
            urls=list_of_urls,
            element_selectors=["main"],
        )