# PentestGPT/utils/web_parser.py
# Uses functions adapted from Auto-GPT: https://github.com/Torantulino/Auto-GPT/blob/master/scripts/browse.py
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from googlesearch import search

######### Quick documentation #########
## Use get_response to fetch the raw requests.Response from a URL.
## Use parse_web to extract the visible text from a URL (bs4 handled).
## Use google_search to get search results from Google. Results are already parsed.
#######################################
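
# Minimal usage sketch for the helpers above (illustrative only: the URL
# and keyword below are placeholders, not values used by this module):
#
#   response, error = get_response("https://example.com")
#   if response is not None:
#       print(response.status_code)
#   text = parse_web("https://example.com")
#   results = google_search("penetration testing", num_results=3)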


# Function to check if the URL is valid
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


# Function to sanitize the URL: keep only scheme, host, and path,
# dropping query strings and fragments
def sanitize_url(url):
    return urljoin(url, urlparse(url).path)
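
# Example of what sanitize_url strips (based on urlparse/urljoin semantics):
#   sanitize_url("https://example.com/page?q=1#frag") -> "https://example.com/page"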


# Check whether the URL points at a local file or the loopback host
def check_local_file_access(url):
    local_prefixes = [
        "file:///",
        "file://localhost",
        "http://localhost",
        "https://localhost",
    ]
    return any(url.startswith(prefix) for prefix in local_prefixes)


def get_response(url, timeout=10) -> tuple:
    """
    Get the response from the URL.

    Parameters:
    ----------
    url (str): The URL to get the response from.
    timeout (int): The timeout in seconds for the HTTP request.

    Returns:
    -------
    response (requests.models.Response): The response from the URL, or None on failure.
    error (str): The error message if any, otherwise None.
    """
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError("Access to local files is restricted")

        # Most basic check if the URL is valid
        if not url.startswith("http://") and not url.startswith("https://"):
            raise ValueError("Invalid URL format")

        sanitized_url = sanitize_url(url)
        user_agent_header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
        }
        response = requests.get(
            sanitized_url, headers=user_agent_header, timeout=timeout
        )

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, "Error: HTTP " + str(response.status_code) + " error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, "Error: " + str(ve)
    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts)
        return None, "Error: " + str(re)
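
# Typical call pattern for get_response (sketch; the URL is a placeholder):
#
#   response, error = get_response("https://example.com", timeout=5)
#   if response is None:
#       print(error)  # e.g. "Error: HTTP 404 error" or a connection error message
#   else:
#       print(len(response.text))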


def parse_web(url) -> str:
    """
    Fetch the URL and return its visible text, or an error message on failure.
    """
    response, potential_error = get_response(url)
    if response is None:
        return potential_error

    # Defensive check; get_response already filters HTTP errors
    if response.status_code >= 400:
        return "Error: HTTP " + str(response.status_code) + " error"

    soup = BeautifulSoup(response.text, "html.parser")

    # Drop script and style elements so only visible text remains
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    # Normalize whitespace: strip each line, split on runs of double spaces,
    # and drop empty chunks (splitting on a single space would put every
    # word on its own line)
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text


def google_search(keyword, num_results=5) -> dict:
    """
    Search on Google and return the results.

    Parameters:
    ----------
    keyword (str): The keyword to search on Google.
    num_results (int): The number of results to return.

    Returns:
    -------
    result (dict): The search results. Format: {"keyword": keyword, "search_result": {url: content}}
    """
    search_result = {}
    for url in search(keyword, tld="com", num=num_results, stop=num_results, pause=2):
        search_result[url] = parse_web(url)
    result = {"keyword": keyword, "search_result": search_result}
    return result
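
# Shape of the dict returned by google_search (sketch with placeholder values):
#   {
#       "keyword": "penetration testing",
#       "search_result": {
#           "https://example.com/a": "parsed page text ...",
#           "https://example.com/b": "parsed page text ...",
#       },
#   }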


if __name__ == "__main__":
    # Test: query Google Search for "what is penetration testing?"
    query = "what is penetration testing?"
    for url in search(query, tld="com", num=5, stop=5, pause=2):
        print(url)
        web_content = parse_web(url)
        print(web_content)