Module llmflex.Tools.web_search_utils

Expand source code
from bs4 import BeautifulSoup, NavigableString, Tag
from langchain.llms.base import LLM
from typing import Optional, List, Union, Callable, Dict, Any

def get_soup_from_url(url: str, timeout: int = 8) -> BeautifulSoup:
    """Get the soup object from a URL.

    Args:
        url (str): URL of the  website.
        timeout (int, optional): Timeout for the request in seconds. Defaults to 8.

    Returns:
        BeautifulSoup: Soup object of the website.
    """
    import requests
    from fake_useragent import UserAgent
    agent  = UserAgent(os = ['windows', 'macos'])
    response = requests.get(url, headers={'User-agent': agent.random}, timeout=timeout)
    if response.status_code != 200:
        return BeautifulSoup('', 'html.parser')
    return BeautifulSoup(response.content, 'html.parser')

def unwanted_contents() -> List[str]:
    """Unwanted elements.

    Returns:
        List[str]: List of unwanted elements.
    """
    unwanted = ['notification-bar', 'banner', 'nav', 'footer', 'sidebar', '.nav', '.footer', '.sidebar', '#nav', '#footer', '#sidebar']
    return unwanted

def wanted_contents() -> List[str]:
    """Wanted elements.

    Returns:
        List[str]: List of wanted elements.
    """
    wanted = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table', 'article', 'section', 'blockquote', 'code', 'pre', 'samp']
    return wanted

def filtered_child(element: Union[BeautifulSoup, Tag]) -> List[Tag]:
    """Get the filtered list of children of an element.

    Args:
        element (Union[BeautifulSoup, Tag]): The element to filter.

    Returns:
        List[Tag]: List of children.
    """
    children = element.children
    output = []
    for child in children:
        if isinstance(child, NavigableString):
            output.append(child)
        elif child.name in unwanted_contents():
            pass
        elif any(c in unwanted_contents() for c in child.get('class', [])):
            pass
        elif child.get('id', '') in unwanted_contents():
            pass
        else:
            output.append(child)
    return output
    
def process_table_row(row: Tag) -> str:
    """Process a table row element.

    Args:
        row (Tag): Table row element.

    Returns:
        str: Formatted row as markdown.
    """
    children = list(row.children)
    children = list(filter(lambda x: x.name in ['th', 'td'], children))
    if len(children) == 0:
        return None
    is_header = children[0].name == 'th'
    data = []
    for child in children:
        out = list(map(process_list_children, child.children))
        out = list(filter(lambda x: x is not None, out))
        out = ' '.join(out)
        out = 'EMPTY CELL' if out.strip(' \n\r\t')=='' else out
        data.append(out)
    num = len(data)
    data = '| ' + ' | '.join(data) + ' |'
    if is_header:
        seps = [':---'] * num
        seps = '| ' + ' | '.join(seps) + ' |'
        data += '\n' + seps
    return data

def format_table(table: Tag) -> str:
    """Format a table element as markdown.

    Args:
        table (Tag): Table element.

    Returns:
        str: Formatted table as markdown.
    """
    children = list(filter(lambda x: x.name == 'tr', list(table.children)))
    children = list(map(process_table_row, children))
    children = list(filter(lambda x: x is not None, children))
    if len(children) == 0:
        return None
    return '\n'.join(children)

def process_list_children(child: Union[Tag, NavigableString], order: int = 0) -> Optional[str]:
    """Process list child elements.

    Args:
        child (Union[Tag, NavigableString]): List child element.
        order (int, optional): Order of the list. Defaults to 0.

    Returns:
        Optional[str]: Formatted child element as markdown or None if it's not needed.
    """
    if isinstance(child, NavigableString):
        out = child.get_text(strip=True)
        out = None if out.strip(' \n\r\t') == '' else out
    elif child.name == 'a':
        out = format_link(child)
    elif child.name =='ol':
        out = format_ordered_list(child, order=order + 1)
    elif child.name =='ul':
        out = format_unordered_list(child, order=order + 1)
    else:
        out = child.get_text(strip=True)
        out = None if out.strip(' \n\r\t') == '' else out
    return out

def format_ordered_list(olist: Tag, order: int = 0) -> Optional[str]:
    """Format an ordered list element as markdown.

    Args:
        olist (Tag): Ordered list element.
        order (int, optional): Order of the list. Defaults to 0.

    Returns:
        Optional[str]: Formatted ordered list as markdown or None if it's empty.
    """
    count = 0
    outputs = []
    for l in olist.children:
        if not isinstance(l, Tag):
            continue
        elif l.name != 'li':
            continue
        else:
            child = list(map(lambda x: process_list_children(x, order), list(l.children)))
            child = list(filter(lambda x: x is not None, child))
            if len(child) == 0: 
                out = None
            else:
                out = ' '.join(child)
            if out is None:
                continue
            if out.strip(' \n\r\t') == '':
                continue
            else:
                count += 1
                outputs.append('\t' * order + f'{count}. {out}')

    if len(outputs) == 0:
        return None
    else:
        return '\n'.join(outputs)

def format_unordered_list(ulist: Tag, order: int = 0) -> Optional[str]:
    """Format an unordered list element as markdown.

    Args:
        ulist (Tag): Unordered list element.
        order (int, optional): Order of the list. Defaults to 0.

    Returns:
        Optional[str]: Formatted unordered list as markdown or None if it's empty.
    """
    outputs = []
    for l in ulist.children:
        if not isinstance(l, Tag):
            continue
        elif l.name != 'li':
            continue
        else:
            child = list(map(lambda x: process_list_children(x, order), list(l.children)))
            child = list(filter(lambda x: x is not None, child))
            if len(child) == 0: 
                out = None
            else:
                out = ' '.join(child)
            if out is None:
                continue
            if out.strip(' \n\r\t') == '':
                continue
            else:
                outputs.append('\t' * order + f'* {out}')

    if len(outputs) == 0:
        return None
    else:
        return '\n'.join(outputs)

def detect_language(code_snippet: str) -> str:
    """Detect the language of a code snippet.

    Args:
        code_snippet (str): Code snippet to guess.

    Returns:
        str: Programming language.
    """
    # Normalize the code snippet to help with detection
    code_snippet_lower = code_snippet.lower()

    if 'class' in code_snippet_lower and 'public static void main' in code_snippet:
        return 'java'
    elif ('def ' in code_snippet or 'import ' in code_snippet) and ':' in code_snippet:
        return 'python'
    elif ('function ' in code_snippet or '=>' in code_snippet) and ('var ' in code_snippet or 'let ' in code_snippet or 'const ' in code_snippet):
        return 'javascript'
    elif '#include' in code_snippet:
        return 'cpp'
    elif code_snippet.startswith('#!/bin/bash') or 'echo ' in code_snippet or 'grep ' in code_snippet:
        return 'bash'
    elif 'def ' in code_snippet and 'end' in code_snippet:
        return 'ruby'
    elif '<?php' in code_snippet_lower or 'echo ' in code_snippet or '->' in code_snippet:
        return 'php'
    elif 'using ' in code_snippet and 'namespace ' in code_snippet:
        return 'csharp'  # Note: Markdown typically uses 'cs' or 'csharp' for C#
    elif '<html>' in code_snippet_lower or '<div>' in code_snippet_lower or 'doctype html' in code_snippet_lower:
        return 'html'
    elif '{' in code_snippet and '}' in code_snippet and (':' in code_snippet or ';' in code_snippet) and ('color:' in code_snippet_lower or 'background:' in code_snippet_lower or 'font-size:' in code_snippet_lower):
        return 'css'
    else:
        return 'plaintext'  # Using 'plaintext' for unknown or plain text code blocks

def format_code(code: Tag, with_wrapper: bool = True) -> Optional[str]:
    """Format a code element as markdown.

    Args:
        code (Tag): Code element.
        with_wrapper (bool, optional): Whether to include language wrappers in the output or not. Defaults to True.

    Returns:
        Optional[str]: Formatted code block as markdown or None if it's not needed.
    """
    text = code.get_text(strip=True)
    if text.strip(' \n\r\t') =='':
        return None
    else:
        output = text.strip(' \n\r\t')
    if with_wrapper:
        return f'```{detect_language(output)}\n' + output + '\n```'
    
def format_paragraph(paragraph: Tag) -> str:
    """Format a paragraph element as markdown.

    Args:
        paragraph (Tag): Paragraph element.

    Returns:
        str: Formatted paragraph as markdown.
    """
    outputs = []
    for child in filtered_child(paragraph):
        if isinstance(child, NavigableString):
            outputs.append(child.get_text(strip=True))
        elif child.name in ['pre', 'code', 'samp']:
            code = format_code(child, with_wrapper=False)
            if code is not None:
                outputs.append(f'`{code}`')
            else:
                code = child.get_text(strip=True)
                outputs.append(f'`{code}`')

        elif child.name == 'a':
            outputs.append(format_link(child))
        else:
            outputs.append(child.get_text(strip=True))
    if len(outputs) == 0:
        return None
    else:
        return ' '.join(outputs)

def format_header(header: Tag) -> str:
    """Format a header element as markdown.

    Args:
        header (Tag): Header element.

    Returns:
        str: Formatted header as markdown.
    """
    size = int(header.name[1])
    return '#' * size + ' ' + header.get_text(strip=True)

def format_link(link: Tag) -> str:
    """Format a link element as markdown.

    Args:
        link (Tag): Link element.

    Returns:
        str: Formatted link as markdown.
    """
    text = link.get_text(strip=True)
    href = link.get('href', '')
    if ((text.strip(' \n\r\t#') == '') & (href.strip(' \n\r\t#') == '')):
        return None
    elif text.strip(' \n\r\t') == '':
        output = f'[{href}]'
    elif href.strip(' \n\r\t') == '':
        output = text
    else:
        output = f'[{text}]({href})'
    return output

def process_element(element: Union[BeautifulSoup, Tag, NavigableString], sep: str = '\n\n', end='  ', as_list: bool = False) -> Optional[Union[str, List[str]]]:
    """Process an element recursively and return the output as text of list of texts by elements.

    Args:
        element (Union[BeautifulSoup, Tag, NavigableString]): Element to process.
        sep (str, optional): Seperator of each element. Defaults to '\n\n'.
        end (str, optional): Added string to the end of each element. Defaults to '  '.
        as_list (bool, optional): Whether to return a list of strings of elements or a single string. Defaults to False.

    Returns:
        Optional[Union[str, List[str]]]: Content string or list of string of the element.
    """
    outputs = []
    for e in filtered_child(element):
        if isinstance(e, NavigableString):
            text = e.get_text(strip=True)
            text = None if text.strip(' \n\r\t') == '' else text
            outputs.append(text)
        elif e.name in ['pre', 'code', 'samp']:
            outputs.append(format_code(e))
        elif e.name == 'ul':
            outputs.append(format_unordered_list(e))
        elif e.name == 'ol':
            outputs.append(format_ordered_list(e))
        elif e.name == 'table':
            outputs.append(format_table(e))
        elif e.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            outputs.append(format_header(e))
        elif e.name == 'a':
            outputs.append(format_link(e))
        elif e.name in ['p']:
            outputs.append(format_paragraph(e))
        else:
            outputs.append(process_element(e, as_list=as_list))
    final = []
    for o in outputs:
        if o is None:
            continue
        elif isinstance(o, list):
            final.extend(o)
        elif o.strip(' \n\r\t') == '':
            continue
        elif len(o) < 3: # Remove random elements with less than 3 characters.
            continue
        else:
            final.append(o)
    if len(final) == 0:
        return None
    final = list(filter(lambda x: x is not None, final))
    if as_list:
        return final
    else:
        final = list(map(lambda x: x + end, final))
        return sep.join(final)
    
def create_content_chunks(contents: Optional[List[str]], token_count_fn: Callable[[str], int], chunk_size: int = 400) -> List[str]:
    """Create a list of strings of chunks limited by the count of tokens.

    Args:
        contents (Optional[List[str]]): List of contents to aggregate.
        token_count_fn (Callable[[str], int]): Function to count tokens.
        chunk_size (int, optional): Token limit of each chunk. Defaults to 400.

    Returns:
        List[str]: List of content chunks.
    """
    chunks = []
    current = []
    current_count = 0
    if contents is None:
        return chunks
    for c in contents:
        count = token_count_fn(c)
        if current_count + count <= chunk_size:
            current.append(c)
            current_count += count
        elif count > chunk_size:
            chunks.append('\n\n'.join(current))
            chunks.append(c)
            current = []
            current_count = 0
        else:
            chunks.append('\n\n'.join(current))
            current = [c]
            current_count = count
    if len(current) != 0:
        chunks.append('\n\n'.join(current))
    return chunks

def get_markdown(url: str, timeout: int = 8, as_list: bool = False) -> Union[str, List[str]]:
    """Get the content of a URL as a string or a list of strings.

    Args:
        url (str): URL of the website.
        timeout (int, optional): Request timeout as seconds. Defaults to 8.
        as_list (bool, optional): Whether to return the content as a list or as a string. Defaults to False.

    Returns:
        Union[str, List[str]]: Content of the URL as a string or a list of string.
    """
    soup = get_soup_from_url(url, timeout=timeout)
    return process_element(soup, as_list=as_list)

def ddg_search(query: str, n: int = 5, urls_only: bool = True, **kwargs) -> List[Union[str, Dict[str, Any]]]:
    """Search with DuckDuckGo.

    Args:
        query (str): Search query.
        n (int, optional): Maximum number of results. Defaults to 5.
        urls_only (bool, optional): Only return the list of urls or return other information as well. Defaults to True.

    Returns:
        List[Union[str, Dict[str, Any]]]: List of search results.
    """
    from duckduckgo_search import DDGS
    with DDGS() as ddgs:
        results = [r for r in ddgs.text(query, max_results=n, **kwargs)]
    if urls_only:
        results = list(map(lambda x: x['href'], results))
    return results

Functions

def create_content_chunks(contents: Optional[List[str]], token_count_fn: Callable[[str], int], chunk_size: int = 400) ‑> List[str]

Create a list of strings of chunks limited by the count of tokens.

Args

contents : Optional[List[str]]
List of contents to aggregate.
token_count_fn : Callable[[str], int]
Function to count tokens.
chunk_size : int, optional
Token limit of each chunk. Defaults to 400.

Returns

List[str]
List of content chunks.
Expand source code
def create_content_chunks(contents: Optional[List[str]], token_count_fn: Callable[[str], int], chunk_size: int = 400) -> List[str]:
    """Create a list of strings of chunks limited by the count of tokens.

    Args:
        contents (Optional[List[str]]): List of contents to aggregate.
        token_count_fn (Callable[[str], int]): Function to count tokens.
        chunk_size (int, optional): Token limit of each chunk. Defaults to 400.

    Returns:
        List[str]: List of content chunks.
    """
    chunks = []
    current = []
    current_count = 0
    if contents is None:
        return chunks
    for c in contents:
        count = token_count_fn(c)
        if current_count + count <= chunk_size:
            current.append(c)
            current_count += count
        elif count > chunk_size:
            chunks.append('\n\n'.join(current))
            chunks.append(c)
            current = []
            current_count = 0
        else:
            chunks.append('\n\n'.join(current))
            current = [c]
            current_count = count
    if len(current) != 0:
        chunks.append('\n\n'.join(current))
    return chunks

Search with DuckDuckGo.

Args

query : str
Search query.
n : int, optional
Maximum number of results. Defaults to 5.
urls_only : bool, optional
Only return the list of urls or return other information as well. Defaults to True.

Returns

List[Union[str, Dict[str, Any]]]
List of search results.
Expand source code
def ddg_search(query: str, n: int = 5, urls_only: bool = True, **kwargs) -> List[Union[str, Dict[str, Any]]]:
    """Search with DuckDuckGo.

    Args:
        query (str): Search query.
        n (int, optional): Maximum number of results. Defaults to 5.
        urls_only (bool, optional): Only return the list of urls or return other information as well. Defaults to True.

    Returns:
        List[Union[str, Dict[str, Any]]]: List of search results.
    """
    from duckduckgo_search import DDGS
    with DDGS() as ddgs:
        results = [r for r in ddgs.text(query, max_results=n, **kwargs)]
    if urls_only:
        results = list(map(lambda x: x['href'], results))
    return results
def detect_language(code_snippet: str) ‑> str

Detect the language of a code snippet.

Args

code_snippet : str
Code snippet to guess.

Returns

str
Programming language.
Expand source code
def detect_language(code_snippet: str) -> str:
    """Detect the language of a code snippet.

    Args:
        code_snippet (str): Code snippet to guess.

    Returns:
        str: Programming language.
    """
    # Normalize the code snippet to help with detection
    code_snippet_lower = code_snippet.lower()

    if 'class' in code_snippet_lower and 'public static void main' in code_snippet:
        return 'java'
    elif ('def ' in code_snippet or 'import ' in code_snippet) and ':' in code_snippet:
        return 'python'
    elif ('function ' in code_snippet or '=>' in code_snippet) and ('var ' in code_snippet or 'let ' in code_snippet or 'const ' in code_snippet):
        return 'javascript'
    elif '#include' in code_snippet:
        return 'cpp'
    elif code_snippet.startswith('#!/bin/bash') or 'echo ' in code_snippet or 'grep ' in code_snippet:
        return 'bash'
    elif 'def ' in code_snippet and 'end' in code_snippet:
        return 'ruby'
    elif '<?php' in code_snippet_lower or 'echo ' in code_snippet or '->' in code_snippet:
        return 'php'
    elif 'using ' in code_snippet and 'namespace ' in code_snippet:
        return 'csharp'  # Note: Markdown typically uses 'cs' or 'csharp' for C#
    elif '<html>' in code_snippet_lower or '<div>' in code_snippet_lower or 'doctype html' in code_snippet_lower:
        return 'html'
    elif '{' in code_snippet and '}' in code_snippet and (':' in code_snippet or ';' in code_snippet) and ('color:' in code_snippet_lower or 'background:' in code_snippet_lower or 'font-size:' in code_snippet_lower):
        return 'css'
    else:
        return 'plaintext'  # Using 'plaintext' for unknown or plain text code blocks
def filtered_child(element: Union[bs4.BeautifulSoup, bs4.element.Tag]) ‑> List[bs4.element.Tag]

Get the filtered list of children of an element.

Args

element : Union[BeautifulSoup, Tag]
The element to filter.

Returns

List[Tag]
List of children.
Expand source code
def filtered_child(element: Union[BeautifulSoup, Tag]) -> List[Tag]:
    """Get the filtered list of children of an element.

    Args:
        element (Union[BeautifulSoup, Tag]): The element to filter.

    Returns:
        List[Tag]: List of children.
    """
    children = element.children
    output = []
    for child in children:
        if isinstance(child, NavigableString):
            output.append(child)
        elif child.name in unwanted_contents():
            pass
        elif any(c in unwanted_contents() for c in child.get('class', [])):
            pass
        elif child.get('id', '') in unwanted_contents():
            pass
        else:
            output.append(child)
    return output
def format_code(code: bs4.element.Tag, with_wrapper: bool = True) ‑> Optional[str]

Format a code element as markdown.

Args

code : Tag
Code element.
with_wrapper : bool, optional
Whether to include language wrappers in the output or not. Defaults to True.

Returns

Optional[str]
Formatted code block as markdown or None if it's not needed.
Expand source code
def format_code(code: Tag, with_wrapper: bool = True) -> Optional[str]:
    """Format a code element as markdown.

    Args:
        code (Tag): Code element.
        with_wrapper (bool, optional): Whether to include language wrappers in the output or not. Defaults to True.

    Returns:
        Optional[str]: Formatted code block as markdown or None if it's not needed.
    """
    text = code.get_text(strip=True)
    if text.strip(' \n\r\t') =='':
        return None
    else:
        output = text.strip(' \n\r\t')
    if with_wrapper:
        return f'```{detect_language(output)}\n' + output + '\n```'
def format_header(header: bs4.element.Tag) ‑> str

Format a header element as markdown.

Args

header : Tag
Header element.

Returns

str
Formatted header as markdown.
Expand source code
def format_header(header: Tag) -> str:
    """Format a header element as markdown.

    Args:
        header (Tag): Header element.

    Returns:
        str: Formatted header as markdown.
    """
    size = int(header.name[1])
    return '#' * size + ' ' + header.get_text(strip=True)

Format a link element as markdown.

Args

link : Tag
Link element.

Returns

str
Formatted link as markdown.
Expand source code
def format_link(link: Tag) -> str:
    """Format a link element as markdown.

    Args:
        link (Tag): Link element.

    Returns:
        str: Formatted link as markdown.
    """
    text = link.get_text(strip=True)
    href = link.get('href', '')
    if ((text.strip(' \n\r\t#') == '') & (href.strip(' \n\r\t#') == '')):
        return None
    elif text.strip(' \n\r\t') == '':
        output = f'[{href}]'
    elif href.strip(' \n\r\t') == '':
        output = text
    else:
        output = f'[{text}]({href})'
    return output
def format_ordered_list(olist: bs4.element.Tag, order: int = 0) ‑> Optional[str]

Format an ordered list element as markdown.

Args

olist : Tag
Ordered list element.
order : int, optional
Order of the list. Defaults to 0.

Returns

Optional[str]
Formatted ordered list as markdown or None if it's empty.
Expand source code
def format_ordered_list(olist: Tag, order: int = 0) -> Optional[str]:
    """Format an ordered list element as markdown.

    Args:
        olist (Tag): Ordered list element.
        order (int, optional): Order of the list. Defaults to 0.

    Returns:
        Optional[str]: Formatted ordered list as markdown or None if it's empty.
    """
    count = 0
    outputs = []
    for l in olist.children:
        if not isinstance(l, Tag):
            continue
        elif l.name != 'li':
            continue
        else:
            child = list(map(lambda x: process_list_children(x, order), list(l.children)))
            child = list(filter(lambda x: x is not None, child))
            if len(child) == 0: 
                out = None
            else:
                out = ' '.join(child)
            if out is None:
                continue
            if out.strip(' \n\r\t') == '':
                continue
            else:
                count += 1
                outputs.append('\t' * order + f'{count}. {out}')

    if len(outputs) == 0:
        return None
    else:
        return '\n'.join(outputs)
def format_paragraph(paragraph: bs4.element.Tag) ‑> str

Format a paragraph element as markdown.

Args

paragraph : Tag
Paragraph element.

Returns

str
Formatted paragraph as markdown.
Expand source code
def format_paragraph(paragraph: Tag) -> str:
    """Format a paragraph element as markdown.

    Args:
        paragraph (Tag): Paragraph element.

    Returns:
        str: Formatted paragraph as markdown.
    """
    outputs = []
    for child in filtered_child(paragraph):
        if isinstance(child, NavigableString):
            outputs.append(child.get_text(strip=True))
        elif child.name in ['pre', 'code', 'samp']:
            code = format_code(child, with_wrapper=False)
            if code is not None:
                outputs.append(f'`{code}`')
            else:
                code = child.get_text(strip=True)
                outputs.append(f'`{code}`')

        elif child.name == 'a':
            outputs.append(format_link(child))
        else:
            outputs.append(child.get_text(strip=True))
    if len(outputs) == 0:
        return None
    else:
        return ' '.join(outputs)
def format_table(table: bs4.element.Tag) ‑> str

Format a table element as markdown.

Args

table : Tag
Table element.

Returns

str
Formatted table as markdown.
Expand source code
def format_table(table: Tag) -> str:
    """Format a table element as markdown.

    Args:
        table (Tag): Table element.

    Returns:
        str: Formatted table as markdown.
    """
    children = list(filter(lambda x: x.name == 'tr', list(table.children)))
    children = list(map(process_table_row, children))
    children = list(filter(lambda x: x is not None, children))
    if len(children) == 0:
        return None
    return '\n'.join(children)
def format_unordered_list(ulist: bs4.element.Tag, order: int = 0) ‑> Optional[str]

Format an unordered list element as markdown.

Args

ulist : Tag
Unordered list element.
order : int, optional
Order of the list. Defaults to 0.

Returns

Optional[str]
Formatted unordered list as markdown or None if it's empty.
Expand source code
def format_unordered_list(ulist: Tag, order: int = 0) -> Optional[str]:
    """Format an unordered list element as markdown.

    Args:
        ulist (Tag): Unordered list element.
        order (int, optional): Order of the list. Defaults to 0.

    Returns:
        Optional[str]: Formatted unordered list as markdown or None if it's empty.
    """
    outputs = []
    for l in ulist.children:
        if not isinstance(l, Tag):
            continue
        elif l.name != 'li':
            continue
        else:
            child = list(map(lambda x: process_list_children(x, order), list(l.children)))
            child = list(filter(lambda x: x is not None, child))
            if len(child) == 0: 
                out = None
            else:
                out = ' '.join(child)
            if out is None:
                continue
            if out.strip(' \n\r\t') == '':
                continue
            else:
                outputs.append('\t' * order + f'* {out}')

    if len(outputs) == 0:
        return None
    else:
        return '\n'.join(outputs)
def get_markdown(url: str, timeout: int = 8, as_list: bool = False) ‑> Union[str, List[str]]

Get the content of a URL as a string or a list of strings.

Args

url : str
URL of the website.
timeout : int, optional
Request timeout as seconds. Defaults to 8.
as_list : bool, optional
Whether to return the content as a list or as a string. Defaults to False.

Returns

Union[str, List[str]]
Content of the URL as a string or a list of string.
Expand source code
def get_markdown(url: str, timeout: int = 8, as_list: bool = False) -> Union[str, List[str]]:
    """Get the content of a URL as a string or a list of strings.

    Args:
        url (str): URL of the website.
        timeout (int, optional): Request timeout as seconds. Defaults to 8.
        as_list (bool, optional): Whether to return the content as a list or as a string. Defaults to False.

    Returns:
        Union[str, List[str]]: Content of the URL as a string or a list of string.
    """
    soup = get_soup_from_url(url, timeout=timeout)
    return process_element(soup, as_list=as_list)
def get_soup_from_url(url: str, timeout: int = 8) ‑> bs4.BeautifulSoup

Get the soup object from a URL.

Args

url : str
URL of the website.
timeout : int, optional
Timeout for the request in seconds. Defaults to 8.

Returns

BeautifulSoup
Soup object of the website.
Expand source code
def get_soup_from_url(url: str, timeout: int = 8) -> BeautifulSoup:
    """Get the soup object from a URL.

    Args:
        url (str): URL of the  website.
        timeout (int, optional): Timeout for the request in seconds. Defaults to 8.

    Returns:
        BeautifulSoup: Soup object of the website.
    """
    import requests
    from fake_useragent import UserAgent
    agent  = UserAgent(os = ['windows', 'macos'])
    response = requests.get(url, headers={'User-agent': agent.random}, timeout=timeout)
    if response.status_code != 200:
        return BeautifulSoup('', 'html.parser')
    return BeautifulSoup(response.content, 'html.parser')
def process_element(element: Union[bs4.BeautifulSoup, bs4.element.Tag, bs4.element.NavigableString], sep: str = '\n\n', end=' ', as_list: bool = False) ‑> Union[str, List[str], ForwardRef(None)]

Process an element recursively and return the output as text of list of texts by elements.

Args:
    element (Union[BeautifulSoup, Tag, NavigableString]): Element to process.
    sep (str, optional): Seperator of each element. Defaults to '

'. end (str, optional): Added string to the end of each element. Defaults to ' '. as_list (bool, optional): Whether to return a list of strings of elements or a single string. Defaults to False.

Returns:
    Optional[Union[str, List[str]]]: Content string or list of string of the element.
Expand source code
def process_element(element: Union[BeautifulSoup, Tag, NavigableString], sep: str = '\n\n', end='  ', as_list: bool = False) -> Optional[Union[str, List[str]]]:
    """Process an element recursively and return the output as text of list of texts by elements.

    Args:
        element (Union[BeautifulSoup, Tag, NavigableString]): Element to process.
        sep (str, optional): Seperator of each element. Defaults to '\n\n'.
        end (str, optional): Added string to the end of each element. Defaults to '  '.
        as_list (bool, optional): Whether to return a list of strings of elements or a single string. Defaults to False.

    Returns:
        Optional[Union[str, List[str]]]: Content string or list of string of the element.
    """
    outputs = []
    for e in filtered_child(element):
        if isinstance(e, NavigableString):
            text = e.get_text(strip=True)
            text = None if text.strip(' \n\r\t') == '' else text
            outputs.append(text)
        elif e.name in ['pre', 'code', 'samp']:
            outputs.append(format_code(e))
        elif e.name == 'ul':
            outputs.append(format_unordered_list(e))
        elif e.name == 'ol':
            outputs.append(format_ordered_list(e))
        elif e.name == 'table':
            outputs.append(format_table(e))
        elif e.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            outputs.append(format_header(e))
        elif e.name == 'a':
            outputs.append(format_link(e))
        elif e.name in ['p']:
            outputs.append(format_paragraph(e))
        else:
            outputs.append(process_element(e, as_list=as_list))
    final = []
    for o in outputs:
        if o is None:
            continue
        elif isinstance(o, list):
            final.extend(o)
        elif o.strip(' \n\r\t') == '':
            continue
        elif len(o) < 3: # Remove random elements with less than 3 characters.
            continue
        else:
            final.append(o)
    if len(final) == 0:
        return None
    final = list(filter(lambda x: x is not None, final))
    if as_list:
        return final
    else:
        final = list(map(lambda x: x + end, final))
        return sep.join(final)
def process_list_children(child: Union[bs4.element.Tag, bs4.element.NavigableString], order: int = 0) ‑> Optional[str]

Process list child elements.

Args

child : Union[Tag, NavigableString]
List child element.
order : int, optional
Order of the list. Defaults to 0.

Returns

Optional[str]
Formatted child element as markdown or None if it's not needed.
Expand source code
def process_list_children(child: Union[Tag, NavigableString], order: int = 0) -> Optional[str]:
    """Process list child elements.

    Args:
        child (Union[Tag, NavigableString]): List child element.
        order (int, optional): Order of the list. Defaults to 0.

    Returns:
        Optional[str]: Formatted child element as markdown or None if it's not needed.
    """
    if isinstance(child, NavigableString):
        out = child.get_text(strip=True)
        out = None if out.strip(' \n\r\t') == '' else out
    elif child.name == 'a':
        out = format_link(child)
    elif child.name =='ol':
        out = format_ordered_list(child, order=order + 1)
    elif child.name =='ul':
        out = format_unordered_list(child, order=order + 1)
    else:
        out = child.get_text(strip=True)
        out = None if out.strip(' \n\r\t') == '' else out
    return out
def process_table_row(row: bs4.element.Tag) ‑> str

Process a table row element.

Args

row : Tag
Table row element.

Returns

str
Formatted row as markdown.
Expand source code
def process_table_row(row: Tag) -> str:
    """Process a table row element.

    Args:
        row (Tag): Table row element.

    Returns:
        str: Formatted row as markdown.
    """
    children = list(row.children)
    children = list(filter(lambda x: x.name in ['th', 'td'], children))
    if len(children) == 0:
        return None
    is_header = children[0].name == 'th'
    data = []
    for child in children:
        out = list(map(process_list_children, child.children))
        out = list(filter(lambda x: x is not None, out))
        out = ' '.join(out)
        out = 'EMPTY CELL' if out.strip(' \n\r\t')=='' else out
        data.append(out)
    num = len(data)
    data = '| ' + ' | '.join(data) + ' |'
    if is_header:
        seps = [':---'] * num
        seps = '| ' + ' | '.join(seps) + ' |'
        data += '\n' + seps
    return data
def unwanted_contents() ‑> List[str]

Unwanted elements.

Returns

List[str]
List of unwanted elements.
Expand source code
def unwanted_contents() -> List[str]:
    """Unwanted elements.

    Returns:
        List[str]: List of unwanted elements.
    """
    unwanted = ['notification-bar', 'banner', 'nav', 'footer', 'sidebar', '.nav', '.footer', '.sidebar', '#nav', '#footer', '#sidebar']
    return unwanted
def wanted_contents() ‑> List[str]

Wanted elements.

Returns

List[str]
List of wanted elements.
Expand source code
def wanted_contents() -> List[str]:
    """Wanted elements.

    Returns:
        List[str]: List of wanted elements.
    """
    wanted = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table', 'article', 'section', 'blockquote', 'code', 'pre', 'samp']
    return wanted