feat: 🎸 Add basic google handler

Grey_D
2023-04-14 23:24:25 +08:00
parent 37f16b7c72
commit 3795331598
3 changed files with 98 additions and 6 deletions


@@ -8,4 +8,5 @@ loguru
beautifulsoup4~=4.11.2
colorama
rich
prompt-toolkit
google
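
Worth noting for anyone reproducing this environment: the new `google` dependency is the PyPI package behind the `googlesearch` import used in the browsing module below, and the names do not match: you install `google` but import `googlesearch`. A minimal sketch:

# Install with: pip install google
# The PyPI package is named "google", but the module it exposes is
# "googlesearch"; `pause` spaces out consecutive queries to reduce
# the chance of being rate-limited by Google.
from googlesearch import search

for result_url in search("penetration testing basics", num=3, stop=3, pause=2):
    print(result_url)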

tests/testBrowsing.py (new file)

@@ -0,0 +1,9 @@
import unittest


class TestBrowsing(unittest.TestCase):
    pass


if __name__ == "__main__":
    unittest.main()
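
The test class is an empty scaffold for now. A sketch of what a first real test could look like, assuming the browsing helpers below end up importable; the diff does not show that module's path, so the `browsing` import name here is a placeholder:

import unittest

# Placeholder import path: the diff does not reveal where the browsing
# module lives, so adjust "browsing" to the real package path.
from browsing import check_local_file_access, is_valid_url


class TestBrowsing(unittest.TestCase):
    def test_is_valid_url(self):
        self.assertTrue(is_valid_url("https://example.com/page"))
        self.assertFalse(is_valid_url("not-a-url"))

    def test_check_local_file_access(self):
        self.assertTrue(check_local_file_access("file:///etc/passwd"))
        self.assertFalse(check_local_file_access("https://example.com"))


if __name__ == "__main__":
    unittest.main()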


@@ -1,14 +1,87 @@
# TODO: parse the web contents with bs4.
# Use functions from Auto-GPT: https://github.com/Torantulino/Auto-GPT/blob/master/scripts/browse.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from googlesearch import search


# Function to check if the URL is valid
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


# Function to sanitize the URL (keeps scheme, host and path; drops the
# query string and fragment)
def sanitize_url(url):
    return urljoin(url, urlparse(url).path)


# Function to block URLs that point at local files or localhost
def check_local_file_access(url):
    local_prefixes = [
        "file:///",
        "file://localhost",
        "http://localhost",
        "https://localhost",
    ]
    return any(url.startswith(prefix) for prefix in local_prefixes)
def get_response(url, timeout=10):
    """
    Get the response from the URL.

    Parameters:
    ----------
    url (str): The URL to get the response from.
    timeout (int): The timeout for the HTTP request.

    Returns:
    -------
    response (requests.models.Response): The response from the URL.
    error (str): The error message if any.
    """
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError("Access to local files is restricted")

        # Most basic check if the URL is valid:
        if not url.startswith("http://") and not url.startswith("https://"):
            raise ValueError("Invalid URL format")

        sanitized_url = sanitize_url(url)
        user_agent_header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
        }
        response = requests.get(
            sanitized_url, headers=user_agent_header, timeout=timeout
        )

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, "Error: HTTP " + str(response.status_code) + " error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, "Error: " + str(ve)
    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
        return None, "Error: " + str(re)
def parse_web(url):
    # get_response handles URL sanitization, local-file blocking and
    # request errors, so no direct requests.get call is needed here.
    response, potential_error = get_response(url)
    if response is None:
        return potential_error

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "Error: HTTP " + str(response.status_code) + " error"
@@ -24,3 +97,12 @@ def parse_web(url):
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text


if __name__ == "__main__":
    # test to query google search on "what is penetration testing?"
    query = "what is penetration testing?"
    for url in search(query, tld="com", num=5, stop=5, pause=2):
        print(url)
        web_content = parse_web(url)
        print(web_content)
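
As a usage note, `get_response` follows a `(response, error)` convention instead of raising, so callers branch on the error slot first. A short sketch of calling it directly, again with `browsing` standing in for the module's real import path:

# Hypothetical direct use of get_response's (response, error) pair.
from browsing import get_response  # placeholder module name

response, error = get_response("https://example.com", timeout=10)
if response is None:
    print(error)  # e.g. "Error: HTTP 404 error" or a connection error message
else:
    print(response.status_code, len(response.text))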