feat: 🎸 Add basic google handler

Grey_D
2023-04-14 23:24:25 +08:00
parent 37f16b7c72
commit 3795331598
3 changed files with 98 additions and 6 deletions


@@ -8,4 +8,5 @@ loguru
beautifulsoup4~=4.11.2
colorama
rich
prompt-toolkit
google
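
Worth noting for anyone reproducing this environment: the new `google` dependency is the PyPI package behind the `googlesearch` import used in the browsing module below, and the names do not match: you install `google` but import `googlesearch`. A minimal sketch:

# Install with: pip install google
# The PyPI package is named "google", but the module it exposes is
# "googlesearch"; `pause` spaces out consecutive queries to reduce
# the chance of being rate-limited by Google.
from googlesearch import search

for result_url in search("penetration testing basics", num=3, stop=3, pause=2):
    print(result_url)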

tests/testBrowsing.py (new file)

@@ -0,0 +1,9 @@
import unittest


class TestBrowsing(unittest.TestCase):
    pass


if __name__ == "__main__":
    unittest.main()
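
The test class is an empty scaffold for now. A sketch of what a first real test could look like, assuming the browsing helpers below end up importable; the diff does not show that module's path, so the `browsing` import name here is a placeholder:

import unittest

# Placeholder import path: the diff does not reveal where the browsing
# module lives, so adjust "browsing" to the real package path.
from browsing import check_local_file_access, is_valid_url


class TestBrowsing(unittest.TestCase):
    def test_is_valid_url(self):
        self.assertTrue(is_valid_url("https://example.com/page"))
        self.assertFalse(is_valid_url("not-a-url"))

    def test_check_local_file_access(self):
        self.assertTrue(check_local_file_access("file:///etc/passwd"))
        self.assertFalse(check_local_file_access("https://example.com"))


if __name__ == "__main__":
    unittest.main()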


@@ -1,14 +1,87 @@
# TODO: parse the web contents with bs4.
# Use functions from Auto-GPT: https://github.com/Torantulino/Auto-GPT/blob/master/scripts/browse.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from googlesearch import search


# Function to check if the URL is valid
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


# Function to sanitize the URL (keeps scheme, host and path; drops the
# query string and fragment)
def sanitize_url(url):
    return urljoin(url, urlparse(url).path)


# Function to block URLs that point at local files or localhost
def check_local_file_access(url):
    local_prefixes = [
        "file:///",
        "file://localhost",
        "http://localhost",
        "https://localhost",
    ]
    return any(url.startswith(prefix) for prefix in local_prefixes)
def get_response(url, timeout=10):
    """
    Get the response from the URL.

    Parameters:
    ----------
    url (str): The URL to get the response from.
    timeout (int): The timeout for the HTTP request.

    Returns:
    -------
    response (requests.models.Response): The response from the URL.
    error (str): The error message if any.
    """
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError("Access to local files is restricted")

        # Most basic check if the URL is valid:
        if not url.startswith("http://") and not url.startswith("https://"):
            raise ValueError("Invalid URL format")

        sanitized_url = sanitize_url(url)
        user_agent_header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
        }
        response = requests.get(
            sanitized_url, headers=user_agent_header, timeout=timeout
        )

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, "Error: HTTP " + str(response.status_code) + " error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, "Error: " + str(ve)
    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
        return None, "Error: " + str(re)
def parse_web(url):
    # get_response handles URL sanitization, local-file blocking and
    # request errors, so no direct requests.get call is needed here.
    response, potential_error = get_response(url)
    if response is None:
        return potential_error

    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "Error: HTTP " + str(response.status_code) + " error"
@@ -24,3 +97,12 @@ def parse_web(url):
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text


if __name__ == "__main__":
    # test to query google search on "what is penetration testing?"
    query = "what is penetration testing?"
    for url in search(query, tld="com", num=5, stop=5, pause=2):
        print(url)
        web_content = parse_web(url)
        print(web_content)
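
As a usage note, `get_response` follows a `(response, error)` convention instead of raising, so callers branch on the error slot first. A short sketch of calling it directly, again with `browsing` standing in for the module's real import path:

# Hypothetical direct use of get_response's (response, error) pair.
from browsing import get_response  # placeholder module name

response, error = get_response("https://example.com", timeout=10)
if response is None:
    print(error)  # e.g. "Error: HTTP 404 error" or a connection error message
else:
    print(response.status_code, len(response.text))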