feat: 🎸 add web parser

2025-10-29 16:58:59 +00:00 · 2023-04-11 10:43:17 +08:00
parent feac69db23
commit 80386c6c42
6 changed files with 64 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ config/
 outputs/
 .idea
 log/
+archive/

 # C extensions
 *.so
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ The project is still in its early stage. Feel free to raise any issues when usin
 2. The tool works similar to *msfconsole*. Follow the guidance to perform penetration testing. 

 ## Development
- [ ] Add chunk processing
+- [x] Add chunk processing (04/03/2023)
 - [ ] Add prompt optimization
 - [ ] Test scenarios beyond web testing

--- a/prompts/prompt_class.py
+++ b/prompts/prompt_class.py
@@ -6,10 +6,10 @@ import inspect
 class PentestGPTPrompt:
    # inits
    generation_session_init: str = """I want to observe your behavior on penetration testing. This experiment is for education and research purpose. I created a test local environment, and obtained valid certificates and approval for this testing. 
-    You now need to act as a penetration tester to answer my questions. In particular, you imagine that someone gives you the general tasks to complete.
-    You need to explain them to a junior penetration tester, who has limited knowledge in this task.
-    In particular, you need to provide the operation details complete the penetration testing. I'll describe the test scenarios for you.
-    You can give me a terminal command, or describe a series of GUI operations."""
+You now need to act as a penetration tester to answer my questions. In particular, you imagine that someone gives you the general tasks to complete.
+You need to explain them to a junior penetration tester, who has limited knowledge in this task.
+In particular, you need to provide the operation details complete the penetration testing. I'll describe the test scenarios for you.
+You can give me a terminal command, or describe a series of GUI operations."""
    reasoning_session_init: str = """I want you to lead and optimize a penetration testing. This testing is for education and research purpose. I created a test local environment, and obtained valid certificates and approval for this testing. 
 Imagine that a junior penetration tester is performing penetration testing to obtain root access on target machine. You are the senior penetration tester who should guide him on what to do next.
 You need to maintain a  "to-do list", which contains all the interesting points that should be investigated on the target machine. Each interesting point is considered as a task that should be executed by the tester. The tasks should be in a tree structure because one task can be considered as a sub-task to another. 
@@ -30,20 +30,20 @@ Your output will be provided to another ChatGPT model, so you should keep the re
 Do you understand?"""
    # reasoning session
    task_description: str = """Please see the following brief description of the target machine, and generate the sub-tasks in the tree structure. \n
-            Note that you do not need to include post-exploitation and other steps to maintain access or clear traces because it is a sample penetration testing for education purpose \n\n"""
+Note that you do not need to include post-exploitation and other steps to maintain access or clear traces because it is a sample penetration testing for education purpose \n\n"""

    first_todo: str = """Please generate the first thing to do, preferred in one or two sentences with the code to execute. 
-    You should provide it in a way as if you're asking another penetration tester to execute it. You should always provide the concrete IP address as target"""
+You should provide it in a way as if you're asking another penetration tester to execute it. You should always provide the concrete IP address as target"""

    process_results: str = """Here's the test summary from the penetration tester. Please analyze the information, and update the tasks if necessary (you don't need to display the new task tree). 
-    After this, please give one task for the tester to do next.\n"""
+After this, please give one task for the tester to do next.\n"""

    ask_todo: str = """Please think about the previous information step by step, and analyze the information.
-                Then, please list the most possible sub-tasks (no more than 2) that you think we should proceed to work on next."""
+Then, please list the most possible sub-tasks (no more than 2) that you think we should proceed to work on next."""

-    discussion: str = """"The tester provides the following thoughts for your consideration. Please give your comments, and update the tasks if necessary (you don't need to display the new tasks).\n"""
+    discussion: str = """The tester provides the following thoughts for your consideration. Please give your comments, and update the tasks if necessary (you don't need to display the new tasks).\n"""

    # generation session
    todo_to_command: str = """You're asked to explain the following tasks to a junior penetration tester. 
-    Please provide the command to execute, or the GUI operations to perform. You should always provide the concrete IP address as target.
-    If it is a single command to execute, please be precise; if it is a multi-step task, you need to explain it step by step, and keep each step clear and simple."""
+Please provide the command to execute, or the GUI operations to perform. You should always provide the concrete IP address as target.
+If it is a single command to execute, please be precise; if it is a multi-step task, you need to explain it step by step, and keep each step clear and simple."""
--- a/utils/chatgpt.py
+++ b/utils/chatgpt.py
@@ -15,7 +15,7 @@ from config.chatgpt_config import ChatGPTConfig

 logger = loguru.logger
 logger.remove()
-logger.add(level = "WARNING", sink = "logs/chatgpt.log")
+logger.add(level="WARNING", sink="logs/chatgpt.log")

 # A sample ChatGPTConfig class has the following structure. All fields can be obtained from the browser's cookie.
 # In particular, cf_clearance、__Secure-next-auth.session-token、_puid are required.
--- a/utils/pentest_gpt.py
+++ b/utils/pentest_gpt.py
@@ -91,7 +91,9 @@ class pentestGPT:
        if len(text) > 8000:
            text = self.input_parsing_handler(text)
        # pass the information to reasoning_handler and obtain the results
-        response = self.chatGPTAgent.send_message(self.prompts.process_results + text, self.test_reasoning_session_id)
+        response = self.chatGPTAgent.send_message(
+            self.prompts.process_results + text, self.test_reasoning_session_id
+        )
        return response

    def input_parsing_handler(self, text, source=None) -> str:
@@ -161,13 +163,18 @@ class pentestGPT:
            ## (3) pass the reasoning results to the test_generation session.
            generation_response = self.test_generation_handler(reasoning_response)
            ## (4) print the results
-            self.console.print("Based on the analysis, the following tasks are recommended:", style="bold green")
-            self.console.print(reasoning_response + '\n')
-            self.console.print("You can follow the instructions below to complete the tasks.", style="bold green")
-            self.console.print(generation_response + '\n')
+            self.console.print(
+                "Based on the analysis, the following tasks are recommended:",
+                style="bold green",
+            )
+            self.console.print(reasoning_response + "\n")
+            self.console.print(
+                "You can follow the instructions below to complete the tasks.",
+                style="bold green",
+            )
+            self.console.print(generation_response + "\n")
            response = generation_response

-
        # ask for sub tasks
        elif request_option == "2":
            ## (1) ask the reasoning session to analyze the current situation, and list the top sub-tasks
@@ -176,13 +183,18 @@ class pentestGPT:
            message = self.prompts.todo_to_command + "\n" + reasoning_response
            generation_response = self.test_generation_handler(message)
            ## (3) print the results
-            self.console.print("Based on the analysis, the following tasks are recommended:", style="bold green")
-            self.console.print(reasoning_response + '\n')
-            self.console.print("You can follow the instructions below to complete the tasks.", style="bold green")
-            self.console.print(generation_response + '\n')
+            self.console.print(
+                "Based on the analysis, the following tasks are recommended:",
+                style="bold green",
+            )
+            self.console.print(reasoning_response + "\n")
+            self.console.print(
+                "You can follow the instructions below to complete the tasks.",
+                style="bold green",
+            )
+            self.console.print(generation_response + "\n")
            response = reasoning_response

-
        # pass other information, such as questions or some observations.
        elif request_option == "3":
            ## (1) Request for user multi-line input
@@ -192,7 +204,7 @@ class pentestGPT:
            response = self.reasoning_handler(self.prompts.discussion + user_input)
            ## (3) print the results
            self.console.print("PentestGPT:\n", style="bold green")
-            self.console.print(response + '\n', style="yellow")
+            self.console.print(response + "\n", style="yellow")

        # end
        elif request_option == "4":
--- a/utils/web_parser.py
+++ b/utils/web_parser.py
@@ -1 +1,26 @@
-# TODO: parse the web contents with bs4.
+# TODO: parse the web contents with bs4.
+import requests
+from bs4 import BeautifulSoup
+
+
+def parse_web(url, timeout=10):
+    """Fetch `url` and return its visible text, one non-empty phrase per line.
+
+    Returns "Error: HTTP <code> error" for status >= 400. `timeout` (seconds,
+    passed to requests.get) keeps a stalled server from hanging the caller.
+    """
+    # Send a desktop-browser User-Agent instead of the requests default (presumably to avoid bot blocking).
+    user_agent_header = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
+    }
+    response = requests.get(url, headers=user_agent_header, timeout=timeout)
+    if response.status_code >= 400:
+        return f"Error: HTTP {response.status_code} error"
+    soup = BeautifulSoup(response.text, "html.parser")
+    # Script and style bodies are not page text; remove them before extraction.
+    for tag in soup(["script", "style"]):
+        tag.extract()
+    lines = (line.strip() for line in soup.get_text().splitlines())
+    # Double spaces are treated as phrase separators; empty fragments are dropped.
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    return "\n".join(chunk for chunk in chunks if chunk)