diff --git a/requirements.txt b/requirements.txt
index 0096752..0007f8d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ loguru
 beautifulsoup4~=4.11.2
 colorama
 rich
-prompt-toolkit
\ No newline at end of file
+prompt-toolkit
+google
\ No newline at end of file
diff --git a/tests/testBrowsing.py b/tests/testBrowsing.py
new file mode 100644
index 0000000..c8ae16a
--- /dev/null
+++ b/tests/testBrowsing.py
@@ -0,0 +1,9 @@
+import unittest
+
+
+class TestBrowsing(unittest.TestCase):
+    pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/utils/web_parser.py b/utils/web_parser.py
index 4fbb520..ec12808 100644
--- a/utils/web_parser.py
+++ b/utils/web_parser.py
@@ -1,14 +1,87 @@
-# TODO: parse the web contents with bs4.
+# Functions adapted from Auto-GPT: https://github.com/Torantulino/Auto-GPT/blob/master/scripts/browse.py
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse, urljoin
+from googlesearch import search
+
+
+# Check that the URL has both a scheme and a network location
+def is_valid_url(url):
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+
+# Sanitize the URL by dropping any query string and fragment
+def sanitize_url(url):
+    return urljoin(url, urlparse(url).path)
+
+
+def check_local_file_access(url):
+    local_prefixes = [
+        "file:///",
+        "file://localhost",
+        "http://localhost",
+        "https://localhost",
+    ]
+    return any(url.startswith(prefix) for prefix in local_prefixes)
+
+
+def get_response(url, timeout=10):
+    """
+    Get the response from the URL.
+
+    Parameters:
+    ----------
+    url (str): The URL to get the response from.
+    timeout (int): The timeout for the HTTP request, in seconds.
+
+    Returns:
+    -------
+    response (requests.models.Response): The response from the URL.
+    error (str): The error message, if any.
+    """
+    try:
+        # Restrict access to local files
+        if check_local_file_access(url):
+            raise ValueError("Access to local files is restricted")
+
+        # Most basic check that the URL is an HTTP(S) URL:
+        if not url.startswith("http://") and not url.startswith("https://"):
+            raise ValueError("Invalid URL format")
+
+        sanitized_url = sanitize_url(url)
+
+        user_agent_header = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
+        }
+
+        response = requests.get(
+            sanitized_url, headers=user_agent_header, timeout=timeout
+        )
+
+        # Check if the response contains an HTTP error
+        if response.status_code >= 400:
+            return None, "Error: HTTP " + str(response.status_code) + " error"
+
+        return response, None
+    except ValueError as ve:
+        # Handle invalid URLs and restricted local-file access
+        return None, "Error: " + str(ve)
+
+    except requests.exceptions.RequestException as re:
+        # Handle exceptions raised by the HTTP request (connection errors, timeouts, etc.)
+        return None, "Error: " + str(re)
 
 
 def parse_web(url):
     # create a user agent header
-    user_agent_header = {
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
-    }
-    response = requests.get(url, headers=user_agent_header)
+    response, potential_error = get_response(url)
+    if response is None:
+        return potential_error
+
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
         return "Error: HTTP " + str(response.status_code) + " error"
@@ -24,3 +97,12 @@ def parse_web(url):
     text = "\n".join(chunk for chunk in chunks if chunk)
 
     return text
+
+
+if __name__ == "__main__":
+    # Smoke test: search Google for the query and parse each result
+    query = "what is penetration testing?"
+    for url in search(query, tld="com", num=5, stop=5, pause=2):
+        print(url)
+        web_content = parse_web(url)
+        print(web_content)