Merge pull request #11 from GreyDGL/browse

Browse
Grey_D 2023-04-17 10:22:24 +08:00 committed by GitHub
commit 05bb7ece79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 161 additions and 11 deletions

View File: README.md

@@ -1,5 +1,10 @@
# PentestGPT
17/04/2023: Due to additional verification by OpenAI, you now need to pass the full `cookie` variable into the session.
You can obtain it by visiting `https://chat.openai.com/api/auth/session` and examining the cookie of the HTTP request in your browser's developer tools. Check the updated chatgpt_config_sample, and see the sanity-check sketch below.
I'll try to fix this properly later.
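A quick way to sanity-check the pasted value (an illustrative sketch, not part of the repo; substitute your own cookie string):

```python
import requests

cookie = "<paste the full cookie string from your browser here>"
r = requests.get(
    "https://chat.openai.com/api/auth/session",
    headers={
        "cookie": cookie,
        # any recent desktop browser user-agent works here
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    },
)
# a valid cookie returns JSON containing an "accessToken" field
print(r.status_code, "accessToken" in r.text)
```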
## Introduction
**PentestGPT** is a penetration testing tool empowered by **ChatGPT**. It is designed to automate the penetration testing process. Built on top of ChatGPT, it operates in an interactive mode to guide penetration testers through both overall progress and specific operations.
A sample test run of **PentestGPT** on a target VulnHub machine (Hackable II) is available [here](./resources/PentestGPT_Hackable2.pdf).

View File: chatgpt_config_sample

@@ -7,5 +7,8 @@ class ChatGPTConfig:
_puid: str = ""
cf_clearance: str = ""
session_token: str = ""
accessToken: str = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1UaEVOVUpHTkVNMVFURTRNMEZCTWpkQ05UZzVNRFUxUlRVd1FVSkRNRU13UmtGRVFrRXpSZyJ9.eyJodHRwczovL2FwaS5vcGVuYWkuY29tL3Byb2ZpbGUiOnsiZW1haWwiOiJnZWxlaUBxdWFudHN0YW1wLmNvbSIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlfSwiaHR0cHM6Ly9hcGkub3BlbmFpLmNvbS9hdXRoIjp7InVzZXJfaWQiOiJ1c2VyLW53ZmxBZzJ0aGxTVkh6cEJnd0dGUmdxRSJ9LCJpc3MiOiJodHRwczovL2F1dGgwLm9wZW5haS5jb20vIiwic3ViIjoiZ29vZ2xlLW9hdXRoMnwxMTM3MDI0Nzk2MzI2NTQ3NTk3NjIiLCJhdWQiOlsiaHR0cHM6Ly9hcGkub3BlbmFpLmNvbS92MSIsImh0dHBzOi8vb3BlbmFpLm9wZW5haS5hdXRoMGFwcC5jb20vdXNlcmluZm8iXSwiaWF0IjoxNjgxNjM3Mjc1LCJleHAiOjE2ODI4NDY4NzUsImF6cCI6IlRkSkljYmUxNldvVEh0Tjk1bnl5d2g1RTR5T282SXRHIiwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCBtb2RlbC5yZWFkIG1vZGVsLnJlcXVlc3Qgb3JnYW5pemF0aW9uLnJlYWQgb2ZmbGluZV9hY2Nlc3MifQ.n1FPeYK6Bu2VCJf5NtY2yro9qzpaA8OfUcscXbUoHkIeuth__LHdbjjnwFbO9midXiaP5nY2wi86LMIWZUU99P-_A6CodAT-Qs26Hef3-6daSFfYuL_tgMJ6jcPGq9KGlGIhytcJ7v4v90XSV5M9JjWMHMsTbMGhz1MuHplQGLZdjDG23mH_qxRUWduhEDExnEkBMNgiFT7POs30fNU5YzLz15w7UBnBTEctH60IfUFlni6C4ibybVZLcbY4IOoAIifW-okKhBpazlSIPDcLAq3CG7nzELbML69omdEM0qbEUCEOiB-E6Z0ICbWJFJGmACGHjycuN2d8F2oDbyGTeQ"
# pass the complete cookie string
cookie: str = "intercom-device-id-dgkjq2bp=0b79bf97-190f-4146-90b1-8e5ee76889a9; __Host-next-auth.csrf-token=0b7e3bb24cc2f1d21030a03269484f928527e4aab16c9b4d344529ee46ca9fe8%7C2a6e7e38eaac7ca8cbcae40912bade72150d8aa18317e3db74f49b125957613a; oai-asdf-ugss=user-nwflAg2thlSVHzpBgwGFRgqE; oai-asdf-gsspc=user-nwflAg2thlSVHzpBgwGFRgqE; intercom-id-dgkjq2bp=73b81fc6-1a89-4778-8602-938e95bb1c8f; cf_clearance=_z12D.4Y9J04S58iQVBXt_SuylQgcf_.9ZhUa6PdQgg-1680495871-0-1-b0e12fb5.3143ca51.e7e07055-160; _ga=GA1.1.251554109.1679673782; _ga_9YTZJE58M9=GS1.1.1681351156.9.1.1681351217.0.0.0; _cfuvid=FpwoyzyYPrG0a0NqtkhvayIWPZmzOQc4B9g3pEunvo8-1681470057686-0-604800000; cf_clearance=Abi_.usI_LK3Eoensyvi9UgaMHxP8uRhgqHgEgwYNS8-1681634935-0-1-30d8c658.7e8ece0b.32f642bc-160; __Secure-next-auth.callback-url=https%3A%2F%2Fchat.openai.com%2F; _puid=user-nwflAg2thlSVHzpBgwGFRgqE:1681637789-PBgn031oMYpyBkSzmaJs5%2F9gCN%2BWMUl6%2B4eJKNKXBHk%3D; intercom-session-dgkjq2bp=TjZpVEU3S0RDa0dZVkJuS3QxNkJpNE44L2VKNGswMnA0ZTJCYi9ud3B5MlN3ZXVQN1Q0Y2htcThUcGR1V3N1Vi0tbGJLd2ViR1hZL0oxek9Wc2NYbjBOdz09--237d1264dc1f1bb1e8751c867154000e0febe08f; __cf_bm=Nl7zy2rM7a8Ix1MB64EyiG5vePkLZ0HX2RtJuj1SYT4-1681638176-0-AUS+5CeavTt8Xs/aw07CxlVVfvtjjfcLCxru0byb1OdTmb5UpP6kbVhesib0j8vJblhaO19VTr7wVEtr46iiA7D+4zVNYD0b4Lh6gZ5wBXlSmf0lrOl/vDhtCn4WOiY92Uu2+6JqWAk6gtHYfSx+waQqzaKIPQnQMNti2IrpyZZd; __Secure-next-auth.session-token=eyJhbGciOiJkaXIiLCJlbmMiOiJBMjU2R0NNIn0..AWS-LTB6eFjfk-oM.7MNUUFAU1yRzRnRzwNlefn-076MjjtFxN2gylg6TN6IJGk19QJ5EkAeLwqgc1ce0YDhbkKCSOf0Y3D5C_ww--qFNfq5gGszIDjcjT7AKx_FMwjfRqVFjAJ2EUvnyZxTGRRXNXUQY21zjYndcETy66mpbVI1v2tZin7ge_9oqqNh_ikedVql0Gn9awBL5qS1hI8NHaAa9EjuJCJjtw4I4cCQ7AlYc52Ze3__cGQxR01cjPZq0HFrVlu89oIOqetexubO1OSYqaDc351Lk16C_dSLQz7XxTv5P_kDv2tUmTExx36z9DuuJY8InwrpHTPa6xH9vqy9HmDj9Yu_56fsNeVhArdbE4ulKzkZo-N1sZfTu2TzJVdOJWlIQSR6Vbdp5BEdVm_a8iYrAC4sCyswSYhnUm-EjxA4UGMFB-K7eDMwbimy1rYKk9hCwLgX2LWSPcRuFNvXvv4mZFPwIO3ZmbJ-U6vE__IaMwq-6fHqvQiQEjjInU5LyIGp4EFrXvlWI3iP7vGRXDneMsRhc84mL-iDYlOddfqWvhko-RLW3X-soMWH1ARpiKyEZCYAPSRZ_HnYjWWeFfEgvEy-lFb5MPPGPvpb9DC3PZty9NoQwQIIxKvWLxzkH4kEiIfQ7Wz4_tXadvgPddqkeBTHAUwvzThPDP_zL6KGy_i09xWAmAwgWU221AXQMc-RD1T0X8tq6ivQBSjVvtgpx4wP0tltG81hiAlTXZMjqG5hWGlWkPDg-uT2SafB5t-8rPnaj_ZIhej3wa78jjgfcJCairEtlFerNtZsCRnoslx9yRIAFuPs4ogIkCJdzNb5vfkJM1djZDaiFBRYIocjjE-AXNJY-bngjDQsofyWxFF_tX3hSwkF4KovMcRyvhp7Jn4QIpioL-n8D3uf11AqSINXY3jXkAXRumBxVtNBgZEEKuaBEqmGckQW5r7UTYai2f9Izoml9d-ccdJdpAS4BigY5_QTZc4fVZ8UJ8wr5rPS-hnx-L-Fp2rhwwqNCDUINKX9v5X3LATibQJkWovxw2HCJK-KVrNlsrh8kRpzqZpjVQ-G7niq8sc5ckE-8IrJuUwNSiDIvi9xHgCl8XQGP0qEivis-cHX_rcNZL7giZvm0jIxra9V1PABxrDpjLuvaLBTquhdvE296Tn7qesKKeVlo4TmxSg3kO6Tro4SFi_-bpNPuS1xXMkGdkP63fFfRSK6k4qPn87MQLu7DDvi_Ccdh_gE8mDhfMKcW6UE9IBf4AMQ3fUbWuQ7hCDtPT-ZLEPR6BkUkyxlkuzImNWWWFK0Q2m0lHsr3WSijm8_gQbWZ1Z1Nw9QNUPWzQVAP2sY6LTvCzq94bR5uwIjjcul7aDSZGG0NWnCuj6tN9jhdDKL3Ble8zJG7v_MBz0ZU64EFCsmM6oplfAPXXofNjYeaViccngn2q4ecosbUCrdw4x6cwBU1l2B_P40qTUGg4Pe301A6qe-7qnuFFe6rYp_ylPB7ADCxYpxHOjeR_bUx-cPd15_FyeBhY7Gly7CETli-LwJaQu51nPccxPj90H49b4NLWzo83Gq4f2Ah4Li-QH-iK_AwZPqhA40-0awZzq9PwGAfZKoEFppwvxBRQ-nRxCz6G_BCLPPZg1bKrLxF-uQWQl3pk1eFmdUaPtqvMEHgrT-28QTSxpzyu_Lh1aoJRuA-P8BeEnanIPJurAxFhspZCzX-37tY7AlQKcMScouASpNP7obQ4DR1nhAmhzj_LpV1lef9hx7EHfknKcxnsyJzg0QNnTpNL_rjMagM5Ni25A8fy5WGXlcjAmp8mlYYuyPQ1XQOxHVy4CrgyyS9t6R-t8s70x3jwFc000mnouU5exF8oijkR41dS-r0A6_Wc4dge6yEktodQVpM2Gqzkdl3pmhH_uIcvfDXiHqu-6mbGhaZrH6yk-L8VAtRut8iSKj0MUX8eBztTPYEaBFuYrH8F36aolFMBuOhBU9FxjXOx2xthOVg-UQttbZA4V5QaiJTIuQMOMis-0V9ErL1Z1Y5fh3vqS408guOheb8fhSobAJD9Bq9hMaJVf-rnWBtjMmnss-jU9VC7fFn1tgk-TAaUauKIyxd-RaWrUfwWoc
WBmtSO_Y0kwPG5kp5aXv7BUd_SyF_9AipuoGc8z7tfZTlPhKuogL9zZJ7DBz4BlidOtSuzeQQS1yx4GM4TJGBgdfaESQlifYyV4Xp21keyCBFf8xWA_HCKGdyXKmO80YtMoOa_WoCIf6KYo_w2-h5BbHe0GhrObIDWD99-TJB-k0Gh17b6TLH9L2J1MO1_wiI3HlKPERYPthE18OQhkktu8iGQ2SeQj1g-Yuh7wrwcFjOpzfM-xZoL2SfXc1Q54fWtglzkophWhRgX9mObEamV97aGrZH-SIT8HswDaH4gzAi094PaxqSQ47lT5iTZerffYWCzL86Y2JTXX7Hoqk84TyhRa6GtDkPu3PLjSYYjbRK7F4iCJ2Vk7UJ3OZ4Hx659gxZ9OO6az4Ymz1TDcpDXgucCpeF86pTsaYrhhe5YP6K6q_C18l7_iViDii41jUbcZpcAFLNWg8clni8Q1X5kF2lM0g3C3ezLcWxjk16bLpK11HPmk5IEEY6DK1TLKoHR6ZT.xzlZwEwM0beQe45RQIeSCw"
error_wait_time: float = 20
is_debugging: bool = False

View File: requirements.txt

@@ -8,4 +8,5 @@ loguru
beautifulsoup4~=4.11.2
colorama
rich
prompt-toolkit
google

tests/testBrowsing.py Normal file
View File

@@ -0,0 +1,9 @@
import unittest
class TestBrowsing(unittest.TestCase):
pass
if __name__ == "__main__":
unittest.main()
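# Illustrative sketch, not part of this commit: the placeholder above could grow
# into offline tests for the URL helpers this PR adds in utils/web_parser.py
# (the import path assumes tests run from the repo root):
from utils.web_parser import check_local_file_access, is_valid_url, sanitize_url

class TestBrowsingHelpers(unittest.TestCase):
    def test_is_valid_url(self):
        self.assertTrue(is_valid_url("https://example.com/page"))
        self.assertFalse(is_valid_url("not a url"))

    def test_sanitize_url_strips_query(self):
        self.assertEqual(
            sanitize_url("http://example.com/path?q=1"), "http://example.com/path"
        )

    def test_local_file_access_is_flagged(self):
        self.assertTrue(check_local_file_access("file:///etc/passwd"))
        self.assertFalse(check_local_file_access("https://example.com"))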

View File

@@ -19,6 +19,7 @@ logger.add(level="WARNING", sink="logs/chatgpt.log")
# A sample ChatGPTConfig class has the following structure. All fields can be obtained from the browser's cookie.
# In particular, cf_clearance, __Secure-next-auth.session-token, and _puid are required.
# Update: login via these individual fields is currently unavailable. The current workaround is to paste in the full cookie string.
# @dataclasses.dataclass
# class ChatGPTConfig:
@@ -67,10 +68,11 @@ class ChatGPT:
self.conversation_dict: Dict[str, Conversation] = {}
self.headers = dict(
{
"cookie": f"cf_clearance={self.cf_clearance}; _puid={self._puid}; "
f"__Secure-next-auth.session-token={self.session_token}",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
# "cookie": f"cf_clearance={self.cf_clearance}; _puid={self._puid}; __cf_bm=Nl7zy2rM7a8Ix1MB64EyiG5vePkLZ0HX2RtJuj1SYT4-1681638176-0-AUS+5CeavTt8Xs/aw07CxlVVfvtjjfcLCxru0byb1OdTmb5UpP6kbVhesib0j8vJblhaO19VTr7wVEtr46iiA7D+4zVNYD0b4Lh6gZ5wBXlSmf0lrOl/vDhtCn4WOiY92Uu2+6JqWAk6gtHYfSx+waQqzaKIPQnQMNti2IrpyZZd; __Secure-next-auth.callback-url=https%3A%2F%2Fchat.openai.com%2F; __Host-next-auth.csrf-token=0b7e3bb24cc2f1d21030a03269484f928527e4aab16c9b4d344529ee46ca9fe8%7C2a6e7e38eaac7ca8cbcae40912bade72150d8aa18317e3db74f49b125957613a;_cfuvid=FpwoyzyYPrG0a0NqtkhvayIWPZmzOQc4B9g3pEunvo8-1681470057686-0-604800000"
# f"__Secure-next-auth.session-token={self.session_token}",
"cookie": self.config.cookie,
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
# 'Content-Type': 'text/event-stream; charset=utf-8',
}
)
@@ -80,12 +82,15 @@ class ChatGPT:
url = "https://chat.openai.com/api/auth/session"
r = requests.get(url, headers=self.headers)
authorization = r.json()["accessToken"]
# authorization = self.config.accessToken
return "Bearer " + authorization
def get_latest_message_id(self, conversation_id):
# Get continuous conversation message id
url = f"https://chat.openai.com/backend-api/conversation/{conversation_id}"
print(self.headers)
r = requests.get(url, headers=self.headers, proxies=self.proxies)
return r.json()["current_node"]
def _parse_message_raw_output(self, response: requests.Response):

View File

@@ -7,6 +7,7 @@ from prompts.prompt_class import PentestGPTPrompt
from utils.prompt_select import prompt_select, prompt_ask
from prompt_toolkit.formatted_text import HTML
from utils.task_handler import main_task_entry, mainTaskCompleter
from utils.web_parser import google_search, parse_web
import loguru
import time, os, textwrap
@@ -230,6 +231,20 @@ class pentestGPT:
self.console.print("PentestGPT:\n", style="bold green")
self.console.print(response + "\n", style="yellow")
# Google
elif request_option == "google":
# get the users input
self.console.print(
"Please enter your search query. PentestGPT will summarize the info from google.",
style="bold green",
)
user_input = prompt_ask(
"(End with <shift + right-arrow>) Your input: ", multiline=False
)
with self.console.status("[bold green] PentestGPT Thinking...") as status:
# query the question
result = self.google_search(user_input, 5) # 5 results by default
# end
elif request_option == "quit":
response = False
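# Illustrative sketch, not part of this commit: `result` above is computed but not
# yet displayed or fed back to the model. A hypothetical helper that renders it with
# the rich console already used here, truncating each page to keep output readable
# (textwrap is already imported in this module):
def _render_search_result(console, result):
    for url, content in result["search_result"].items():
        console.print(f"[bold blue]{url}[/bold blue]")
        console.print(textwrap.shorten(content, width=500, placeholder=" ..."), style="yellow")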

View File: utils/task_handler.py

@@ -16,6 +16,7 @@ class mainTaskCompleter(Completer):
"more",
"todo",
"discuss",
"google",
"help",
"quit",
]
@@ -25,6 +26,7 @@
"more": HTML("Explain the task with more details."),
"todo": HTML("Ask <b>PentestGPT</b> for todos."),
"discuss": HTML("Discuss with <b>PentestGPT</b>."),
"google": HTML("Search on Google."),
"help": HTML("Show the help page."),
"quit": HTML("End the current session."),
}
@@ -35,6 +37,7 @@ Below are the available tasks:
- more: Explain the previously given task in more detail.
- todo: Ask PentestGPT for the task list and what to do next.
- discuss: Discuss with PentestGPT. You can ask for help, discuss the task, or give any feedback.
- google: Search your question on Google. The results are automatically parsed and summarized by PentestGPT.
- help: Show this help page.
- quit: End the current session."""
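# Illustrative sketch, not part of this commit: a completer like mainTaskCompleter
# is typically wired into a prompt_toolkit session along these lines; the repo's
# actual prompt plumbing lives in utils/prompt_select.py and may differ.
from prompt_toolkit import PromptSession

def demo_prompt() -> str:
    session = PromptSession()
    # offers tab completion over the tasks defined above ("next", "more", "google", ...)
    return session.prompt("> ", completer=mainTaskCompleter())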

View File: utils/web_parser.py

@@ -1,14 +1,94 @@
# TODO: parse the web contents with bs4.
# Use functions from Auto-GPT: https://github.com/Torantulino/Auto-GPT/blob/master/scripts/browse.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from googlesearch import search
######### Quick documentation #########
## Use get_response to get the raw response from the URL
## Use parse_web to get the text from the URL (bs4 handled)
## Use google_search to get the search results from Google. Results are already parsed.
#######################################
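## Example usage (illustrative sketch, not part of this commit):
##   response, error = get_response("https://example.com")
##   page_text = parse_web("https://example.com")
##   results = google_search("what is penetration testing?", num_results=3)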
# Function to check if the URL is valid
def is_valid_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
# Function to sanitize the URL
def sanitize_url(url):
return urljoin(url, urlparse(url).path)
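# Function to check whether the URL points at a local file or localhost (added comment; blocked in get_response below)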
def check_local_file_access(url):
local_prefixes = [
"file:///",
"file://localhost",
"http://localhost",
"https://localhost",
]
return any(url.startswith(prefix) for prefix in local_prefixes)
def get_response(url, timeout=10) -> tuple:
"""
Get the response from the URL.
Parameters:
----------
url (str): The URL to get the response from.
timeout (int): The timeout for the HTTP request.
Returns:
-------
response (requests.models.Response): The response from the URL.
error (str): The error message if any.
"""
try:
# Restrict access to local files
if check_local_file_access(url):
raise ValueError("Access to local files is restricted")
# Most basic check if the URL is valid:
if not url.startswith("http://") and not url.startswith("https://"):
raise ValueError("Invalid URL format")
sanitized_url = sanitize_url(url)
user_agent_header = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
response = requests.get(
sanitized_url, headers=user_agent_header, timeout=timeout
)
# Check if the response contains an HTTP error
if response.status_code >= 400:
return None, "Error: HTTP " + str(response.status_code) + " error"
return response, None
except ValueError as ve:
# Handle invalid URL format
return None, "Error: " + str(ve)
except requests.exceptions.RequestException as re:
# Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
return None, "Error: " + str(re)
def parse_web(url) -> str:
response, potential_error = get_response(url)
if response is None:
return potential_error
# Check if the response contains an HTTP error
if response.status_code >= 400:
return "Error: HTTP " + str(response.status_code) + " error"
@@ -24,3 +104,32 @@ def parse_web(url):
text = "\n".join(chunk for chunk in chunks if chunk)
return text
def google_search(keyword, num_results=5) -> dict:
"""
Search on Google and return the results.
Parameters:
----------
keyword (str): The keyword to search on Google.
num_results (int): The number of results to return.
Returns:
-------
result (dict): The search results. Format: {"keyword": keyword, "search_result": {url: content}}
"""
search_result = {}
for url in search(keyword, tld="com", num=num_results, stop=num_results, pause=2):
search_result[url] = parse_web(url)
result = {"keyword": keyword, "search_result": search_result}
return result
if __name__ == "__main__":
# test to query google search on "what is penetration testing?"
query = "what is penetration testing?"
for url in search(query, tld="com", num=5, stop=5, pause=2):
print(url)
web_content = parse_web(url)
print(web_content)