mirror of
https://github.com/weyne85/PentestGPT.git
synced 2025-10-29 16:58:59 +00:00
feat: 🎸 add web parser
This commit is contained in:
@@ -1 +1,26 @@
|
||||
# TODO: parse the web contents with bs4.
|
||||
# TODO: parse the web contents with bs4.
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def parse_web(url):
|
||||
# create a user agent header
|
||||
user_agent_header = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
|
||||
}
|
||||
response = requests.get(url, headers=user_agent_header)
|
||||
# Check if the response contains an HTTP error
|
||||
if response.status_code >= 400:
|
||||
return "Error: HTTP " + str(response.status_code) + " error"
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
text = soup.get_text()
|
||||
lines = (line.strip() for line in text.splitlines())
|
||||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||||
text = "\n".join(chunk for chunk in chunks if chunk)
|
||||
|
||||
return text
|
||||
|
||||
Reference in New Issue
Block a user