fix: 🐛 minor API bug fix

Delete the chat session from the class
2025-10-29 16:58:59 +00:00 · 2023-03-21 13:16:14 +08:00
parent de9010d25e
commit 2166113902
6 changed files with 131 additions and 44 deletions
--- a/tasks/NLP_RESTAPI_Generation.py
+++ b/tasks/NLP_RESTAPI_Generation.py
@@ -1,13 +0,0 @@
-from utils.chatgpt import ChatGPT
-from config.chatgpt_config import ChatGPTConfig
-
-import loguru
-
-logger = loguru.logger
-
-# format: {name: {description: str, sample_curl: str, sample_response: str}}
-API_description = {}
-
-
-if __name__ == "__main__":
-    chatGPTAgent = ChatGPT()
--- a/tasks/crawler.py
+++ b/tasks/crawler.py
@@ -2,39 +2,62 @@ import requests
 from bs4 import BeautifulSoup
 import json

-def parse_markdown_page(url):
+
+def crawl_dotCMS_description_page(
+    url="https://www.dotcms.com/docs/latest/container-api", output_dir="outputs"
+):
    page = requests.get(url)
-    soup = BeautifulSoup(page.content, 'html.parser')
+    soup = BeautifulSoup(page.content, "html.parser")

    # Extract the title of the page
-    title = soup.find('h1').get_text()
+    title = soup.find("h1").get_text()

    # Extract the subtitles and their descriptions and code chunks
-    subtitles = soup.find_all('h2')
+    subtitles = soup.find_all("h2")
    parsed_subtitles = []
    for subtitle in subtitles:
        subtitle_title = subtitle.get_text()
-        subtitle_contents = subtitle.find_next_siblings(['p', 'pre'])
+        subtitle_contents = subtitle.find_next_siblings(["p", "pre"])
        subtitle_parsed_contents = []
-        for i in range(0, len(subtitle_contents), 2):
-            description = subtitle_contents[i].get_text()
-            if len(subtitle_contents) > i+1:
-                code = subtitle_contents[i+1].get_text()
+        description = ""
+        for content in subtitle_contents:
+            # Check if the content is a code block
+            if content.name == "pre" and content.code:
+                code = content.get_text()
+                # Add the previous description and code chunk to the list
+                if len(description) != 0:  # If there is no description, don't add it
+                    parsed_description = description.strip().replace("\n", " ")
+                    parsed_code = code.strip().replace("\n", " ")
+                    subtitle_parsed_contents.append([parsed_description, parsed_code])
+
            else:
-                code = ''
-            subtitle_parsed_contents.append([description, code])
+                # Concatenate the non-code content into a single description string
+                description += (
+                    "\n" + content.get_text() if description else content.get_text()
+                )
        parsed_subtitles.append([subtitle_title, subtitle_parsed_contents])

    # Save the results as a structured JSON object
-    output = {'title': title}
+    title = title.strip().replace(" ", "_").lower()
+    output = {"title": title}
    for i in range(len(parsed_subtitles)):
-        output[f'subtitle{i+1}'] = parsed_subtitles[i][1]
-
-    with open('output.json', 'w') as outfile:
-        json.dump(output, outfile, indent=4)
+        output[parsed_subtitles[i][0]] = parsed_subtitles[i][1]

+    with open(f"{output_dir}/{title}.json", "w") as f:
+        json.dump(output, f)
    return output

-url = 'https://www.dotcms.com/docs/latest/container-api'
-output = parse_markdown_page(url)
-print(output)
+
+def crawl_strapi_documentation(url, output_dir="outputs"):
+    pass
+
+
+if __name__ == "__main__":
+    output_dir = "outputs"
+
+    # example 1: crawl the description page of dotCMS container API
+    # dotCMS_url = 'https://www.dotcms.com/docs/latest/container-api'
+    # output = crawl_dotCMS_description_page(url=dotCMS_url, output_dir=output_dir)
+
+    # example 2: crawl the documentation page of Strapi (crawl_strapi_documentation is still a stub)
+    pass