From 2166113902a491ae2f4e4e5ed8b8c60da217281a Mon Sep 17 00:00:00 2001
From: Grey_D <GDENG003@e.ntu.edu.sg>
Date: Tue, 21 Mar 2023 13:16:14 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=F0=9F=90=9B=20minor=20API=20bug=20fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove a deleted chat session's conversation id from the ChatGPT class's local message-id cache
---
 NLP_RESTAPI_Generation.py       | 66 +++++++++++++++++++++++++++++++++
 tasks/NLP_RESTAPI_Generation.py | 13 -------
 tasks/crawler.py                | 61 ++++++++++++++++++++----------
 utils/__init__.py               |  0
 utils/chatgpt.py                | 23 +++++++++---
 utils/chatgpt_browser.py        | 12 +++---
 6 files changed, 131 insertions(+), 44 deletions(-)
 create mode 100644 NLP_RESTAPI_Generation.py
 delete mode 100644 tasks/NLP_RESTAPI_Generation.py
 create mode 100644 utils/__init__.py

diff --git a/NLP_RESTAPI_Generation.py b/NLP_RESTAPI_Generation.py
new file mode 100644
index 0000000..bd014b7
--- /dev/null
+++ b/NLP_RESTAPI_Generation.py
@@ -0,0 +1,66 @@
+from utils.chatgpt import ChatGPT
+from config.chatgpt_config import ChatGPTConfig
+import json
+
+import loguru
+
+logger = loguru.logger
+
+# format of the crawled JSON: {"title": str, <section name>: [[description, sample_curl], ...]}
+
+task_prompt_0 = """
+I need your help to convert natural language REST API documentation to OpenAPI 3.0 standard.
+Here are the detailed requirements:
+(1) I hope that the converted openapi documentation is in json format. I will give you the description for one request at a time, and you return me the corresponding json. You should handle the output with proper indent, so that I could paste all your outputs together to form a complete documentation.
+(2) For each request, I'll give you a sample curl, and a request description. You should formulate the documentation based on them, especially to fill the "example" field of the request.
+"""
+
+task_prompt_1 = """
+Now we start with a service called dotCMS. Please generate a header for OpenAPI 3.0 first. Take care of the indentation so that I can directly put it together with later outputs to form one API documentation.
+It supports authorization token for each request. A sample curl looks like this: 
+```
+curl --location --request GET 'https://demo.dotcms.com/api/v1/containers/working?containerId=REPLACE_THIS_UUID' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Basic YWRtaW5AZG90Y21zLmNvbTphZG1pbg=='
+```
+"""
+
+task_prompt_2 = """
+Let's start now. In the following, I'll give you a sample curl, and a request description. 
+"""
+
+if __name__ == "__main__":
+    code_fragments = []
+    chatGPTAgent = ChatGPT(ChatGPTConfig())
+    text, conversation_id = chatGPTAgent.send_new_message(task_prompt_0)
+    text = chatGPTAgent.send_message(task_prompt_1, conversation_id)
+    text = chatGPTAgent.send_message(task_prompt_2, conversation_id)
+
+    # load the documentation
+    with open("outputs/container_api.json", "r") as f:
+        container_api = json.load(f)
+    for key, value in container_api.items():
+        if key == "title":
+            # TODO: get title
+            pass
+        elif len(value) != 0:  # is not an empty list
+            title_name = key
+            for item_list in value:
+                description = item_list[0]
+                sample_curl = item_list[1]
+            # concat description and sample_curl
+            ask_text = (
+                "The meta function is "
+                + title_name
+                + "\nThe request description is:"
+                + description
+                + "\nThe sample curl is below: \n"
+                + sample_curl
+                + "\n"
+            )
+            # send description and curl
+            response = chatGPTAgent.send_message(ask_text, conversation_id)
+            # extract code fragments
+            code_fragments.append(chatGPTAgent.extract_code_fragments(response))
+        else:
+            logger.info("No request to process.")
diff --git a/tasks/NLP_RESTAPI_Generation.py b/tasks/NLP_RESTAPI_Generation.py
deleted file mode 100644
index 3833c97..0000000
--- a/tasks/NLP_RESTAPI_Generation.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from utils.chatgpt import ChatGPT
-from config.chatgpt_config import ChatGPTConfig
-
-import loguru
-
-logger = loguru.logger
-
-# format: {name: {description: str, sample_curl: str, sample_response: str}}
-API_description = {}
-
-
-if __name__ == "__main__":
-    chatGPTAgent = ChatGPT()
diff --git a/tasks/crawler.py b/tasks/crawler.py
index 0962877..acd86ae 100644
--- a/tasks/crawler.py
+++ b/tasks/crawler.py
@@ -2,39 +2,62 @@ import requests
 from bs4 import BeautifulSoup
 import json
 
-def parse_markdown_page(url):
+
+def crawl_dotCMS_description_page(
+    url="https://www.dotcms.com/docs/latest/container-api", output_dir="outputs"
+):
     page = requests.get(url)
-    soup = BeautifulSoup(page.content, 'html.parser')
+    soup = BeautifulSoup(page.content, "html.parser")
 
     # Extract the title of the page
-    title = soup.find('h1').get_text()
+    title = soup.find("h1").get_text()
 
     # Extract the subtitles and their descriptions and code chunks
-    subtitles = soup.find_all('h2')
+    subtitles = soup.find_all("h2")
     parsed_subtitles = []
     for subtitle in subtitles:
         subtitle_title = subtitle.get_text()
-        subtitle_contents = subtitle.find_next_siblings(['p', 'pre'])
+        subtitle_contents = subtitle.find_next_siblings(["p", "pre"])
         subtitle_parsed_contents = []
-        for i in range(0, len(subtitle_contents), 2):
-            description = subtitle_contents[i].get_text()
-            if len(subtitle_contents) > i+1:
-                code = subtitle_contents[i+1].get_text()
+        description = ""
+        for content in subtitle_contents:
+            # Check if the content is a code block
+            if content.name == "pre" and content.code:
+                code = content.get_text()
+                # Add the previous description and code chunk to the list
+                if len(description) != 0:  # If there is no description, don't add it
+                    parsed_description = description.strip().replace("\n", " ")
+                    parsed_code = code.strip().replace("\n", " ")
+                    subtitle_parsed_contents.append([parsed_description, parsed_code])
+
             else:
-                code = ''
-            subtitle_parsed_contents.append([description, code])
+                # Concatenate the non-code content into a single description string
+                description += (
+                    "\n" + content.get_text() if description else content.get_text()
+                )
         parsed_subtitles.append([subtitle_title, subtitle_parsed_contents])
 
     # Save the results as a structured JSON object
-    output = {'title': title}
+    title = title.strip().replace(" ", "_").lower()
+    output = {"title": title}
     for i in range(len(parsed_subtitles)):
-        output[f'subtitle{i+1}'] = parsed_subtitles[i][1]
-
-    with open('output.json', 'w') as outfile:
-        json.dump(output, outfile, indent=4)
+        output[parsed_subtitles[i][0]] = parsed_subtitles[i][1]
 
+    with open(f"{output_dir}/{title}.json", "w") as f:
+        json.dump(output, f)
     return output
 
-url = 'https://www.dotcms.com/docs/latest/container-api'
-output = parse_markdown_page(url)
-print(output)
\ No newline at end of file
+
+def crawl_strapi_documentation(url, output_dir="outputs"):
+    pass
+
+
+if __name__ == "__main__":
+    output_dir = "outputs"
+
+    # example 1: crawl the description page of dotCMS container API
+    # dotCMS_url = 'https://www.dotcms.com/docs/latest/container-api'
+    # output = crawl_dotCMS_description_page(url=dotCMS_url, output_dir=output_dir)
+
+    # example 2: crawl the documentation page of Strapi (crawl_strapi_documentation is not implemented yet)
+    pass
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/chatgpt.py b/utils/chatgpt.py
index d3daf1e..96658e2 100644
--- a/utils/chatgpt.py
+++ b/utils/chatgpt.py
@@ -13,6 +13,18 @@ from config.chatgpt_config import ChatGPTConfig
 
 logger = loguru.logger
 
+# A sample ChatGPTConfig class has the following structure. All fields can be obtained from the browser's cookie.
+# In particular, cf_clearance, __Secure-next-auth.session-token, and _puid are required.
+
+# @dataclasses.dataclass
+# class ChatGPTConfig:
+#     model: str = "text-davinci-002-render-sha"
+#     _puid: str = ""
+#     cf_clearance: str = ""
+#     session_token: str = ""
+#     error_wait_time: float = 20
+#     is_debugging: bool = False
+
 
 class ChatGPT:
     def __init__(self, config: ChatGPTConfig):
@@ -42,13 +54,11 @@ class ChatGPT:
         return "Bearer " + authorization
 
     def get_latest_message_id(self, conversation_id):
-        # 获取会话窗口最新消息id，连续对话必须
         url = f"https://chat.openai.com/backend-api/conversation/{conversation_id}"
         r = requests.get(url, headers=self.headers, proxies=self.proxies)
         return r.json()["current_node"]
 
     def send_new_message(self, message):
-        # 发送新会话窗口消息，返回会话id
         logger.info(f"send_new_message")
         url = "https://chat.openai.com/backend-api/conversation"
         message_id = str(uuid1())
@@ -67,7 +77,6 @@ class ChatGPT:
 
         r = requests.post(url, headers=self.headers, json=data, proxies=self.proxies)
         if r.status_code != 200:
-            # 发送消息阻塞时等待20秒从新发送
             logger.error(r.json()["detail"])
             time.sleep(self.config.error_wait_time)
             return self.send_new_message(message)
@@ -89,10 +98,8 @@ class ChatGPT:
         return text, conversation_id
 
     def send_message(self, message, conversation_id):
-        # 指定会话窗口发送连续对话消息
         logger.info(f"send_message")
         url = "https://chat.openai.com/backend-api/conversation"
-        # 获取会话窗口最新消息id
         if conversation_id not in self.latest_message_id_dict:
             logger.info(f"conversation_id: {conversation_id}")
             message_id = self.get_latest_message_id(conversation_id)
@@ -115,7 +122,6 @@ class ChatGPT:
         }
         r = requests.post(url, headers=self.headers, json=data, proxies=self.proxies)
         if r.status_code != 200:
-            # 发送消息阻塞时等待20秒从新发送
             logger.warning(r.json()["detail"])
             time.sleep(self.config.error_wait_time)
             return self.send_message(message, conversation_id)
@@ -158,6 +164,11 @@ class ChatGPT:
             "is_visible": False,
         }
         r = requests.patch(url, headers=self.headers, json=data, proxies=self.proxies)
+
+        # delete conversation id locally
+        if conversation_id in self.latest_message_id_dict:
+            del self.latest_message_id_dict[conversation_id]
+
         if r.status_code == 200:
             return True
         else:
diff --git a/utils/chatgpt_browser.py b/utils/chatgpt_browser.py
index 44893bd..c796f85 100644
--- a/utils/chatgpt_browser.py
+++ b/utils/chatgpt_browser.py
@@ -20,20 +20,21 @@ class ChatGPTBrowser:
     The ChatGPT Wrapper based on browser (playwright).
     It keeps the same interface as ChatGPT.
     """
+
     def __init__(self, model=None):
         config = Config()
         if model is not None:
-            config.set('chat.model', model)
+            config.set("chat.model", model)
         self.bot = ChatGPT(config)
 
     def get_authorization(self):
         # TODO: get authorization from browser
-        return 
+        return
 
     def get_latest_message_id(self, conversation_id):
         # TODO: get latest message id from browser
-        return 
-    
+        return
+
     def get_conversation_history(self, limit=20, offset=0):
         # Get the conversation id in the history
         return self.bot.get_history(limit, offset)
@@ -52,16 +53,15 @@ class ChatGPTBrowser:
     def extract_code_fragments(self, text):
         code_fragments = re.findall(r"```(.*?)```", text, re.DOTALL)
         return code_fragments
-    
 
     def delete_conversation(self, conversation_id=None):
         # delete conversation with its uuid
         if conversation_id is not None:
             self.bot.delete_conversation(conversation_id)
 
+
 if __name__ == "__main__":
     chatgptBrowser_session = ChatGPTBrowser()
     text, conversation_id = chatgptBrowser_session.send_new_message(
         "I am a new tester for RESTful APIs."
     )
-