mirror of https://github.com/weyne85/PentestGPT.git
synced 2025-10-29 16:58:59 +00:00
feat: 🎸 Add basic google handler
@@ -8,4 +8,5 @@ loguru
beautifulsoup4~=4.11.2
colorama
rich
prompt-toolkit
google
tests/testBrowsing.py (new file, 9 lines)
@@ -0,0 +1,9 @@
import unittest


class TestBrowsing(unittest.TestCase):
    pass


if __name__ == "__main__":
    unittest.main()
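The test class ships as an empty placeholder. A minimal sketch of a first test, exercising the two pure helpers from the browsing code below (the import path `browse` is a hypothetical stand-in, since the diff does not show the module's file name):

import unittest

from browse import check_local_file_access, sanitize_url  # hypothetical path


class TestBrowsing(unittest.TestCase):
    def test_check_local_file_access(self):
        # Local-file and localhost URLs should be flagged; public URLs should not.
        self.assertTrue(check_local_file_access("file:///etc/passwd"))
        self.assertTrue(check_local_file_access("http://localhost:8000"))
        self.assertFalse(check_local_file_access("https://example.com"))

    def test_sanitize_url(self):
        # sanitize_url keeps scheme, host, and path but drops the query string.
        self.assertEqual(
            sanitize_url("https://example.com/a?b=1"), "https://example.com/a"
        )


if __name__ == "__main__":
    unittest.main()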
@@ -1,14 +1,87 @@
# TODO: parse the web contents with bs4.
# Use functions from Auto-GPT: https://github.com/Torantulino/Auto-GPT/blob/master/scripts/browse.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from googlesearch import search


# Function to check if the URL is valid
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


# Function to sanitize the URL: keeps scheme, host, and path,
# dropping any query string and fragment
def sanitize_url(url):
    return urljoin(url, urlparse(url).path)


# Block obvious local-file and localhost targets (literal prefix match only,
# so aliases such as "http://127.0.0.1" are not caught)
def check_local_file_access(url):
    local_prefixes = [
        "file:///",
        "file://localhost",
        "http://localhost",
        "https://localhost",
    ]
    return any(url.startswith(prefix) for prefix in local_prefixes)


def get_response(url, timeout=10):
    """
    Get the response from the URL.

    Parameters:
    ----------
    url (str): The URL to get the response from.
    timeout (int): The timeout for the HTTP request.

    Returns:
    -------
    response (requests.models.Response): The response from the URL.
    error (str): The error message if any.
    """
    try:
        # Restrict access to local files
        if check_local_file_access(url):
            raise ValueError("Access to local files is restricted")

        # Most basic check if the URL is valid:
        if not url.startswith("http://") and not url.startswith("https://"):
            raise ValueError("Invalid URL format")

        sanitized_url = sanitize_url(url)

        user_agent_header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
        }

        response = requests.get(
            sanitized_url, headers=user_agent_header, timeout=timeout
        )

        # Check if the response contains an HTTP error
        if response.status_code >= 400:
            return None, "Error: HTTP " + str(response.status_code) + " error"

        return response, None
    except ValueError as ve:
        # Handle invalid URL format
        return None, "Error: " + str(ve)

    except requests.exceptions.RequestException as re:
        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
        return None, "Error: " + str(re)
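

# Usage sketch (illustrative, not part of this commit): get_response returns
# a (response, error) tuple, so callers branch on whichever side is None:
#
#     response, error = get_response("https://example.com")
#     if response is None:
#         print(error)
#     else:
#         print(response.status_code, len(response.text))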


def parse_web(url):
    # get_response validates the URL, sets the User-Agent header, and filters
    # HTTP errors, so no direct requests.get call or status check is needed here.
    response, potential_error = get_response(url)
    if response is None:
        return potential_error
@@ -24,3 +97,12 @@ def parse_web(url):
    text = "\n".join(chunk for chunk in chunks if chunk)

    return text


if __name__ == "__main__":
    # test to query google search on "what is penetration testing?"
    query = "what is penetration testing?"
    for url in search(query, tld="com", num=5, stop=5, pause=2):
        print(url)
        web_content = parse_web(url)
        print(web_content)
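
Taken together, these pieces make up the "basic google handler" the commit title describes: search() yields result URLs and parse_web() turns each page into plain text. A minimal sketch of a wrapper a caller might build on top (the name google_handler and the max_results parameter are illustrative assumptions, not part of this commit):

def google_handler(query, max_results=3):
    # Search Google for the query and parse each result page into plain text.
    results = []
    for url in search(query, tld="com", num=max_results, stop=max_results, pause=2):
        results.append((url, parse_web(url)))
    return results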