PentestGPT/tasks/crawler.py

import json
import os

import requests
from bs4 import BeautifulSoup


def crawl_dotCMS_description_page(
    url="https://www.dotcms.com/docs/latest/container-api", output_dir="outputs"
):
    """Crawl a dotCMS docs page and save each section's text/code pairs as JSON."""
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    # Extract the title of the page
    title = soup.find("h1").get_text()
    # Extract the subtitles and their descriptions and code chunks
    subtitles = soup.find_all("h2")
    parsed_subtitles = []
    for subtitle in subtitles:
        subtitle_title = subtitle.get_text()
        # Collect only the <p>/<pre> siblings that belong to this section,
        # stopping at the next <h2> so later sections are not swept in
        subtitle_contents = []
        for sibling in subtitle.find_next_siblings():
            if sibling.name == "h2":
                break
            if sibling.name in ("p", "pre"):
                subtitle_contents.append(sibling)
        subtitle_parsed_contents = []
        description = ""
        for content in subtitle_contents:
            # Check if the content is a code block
            if content.name == "pre" and content.code:
                code = content.get_text()
                # Add the previous description and code chunk to the list
                if len(description) != 0:  # If there is no description, don't add it
                    parsed_description = description.strip().replace("\n", " ")
                    parsed_code = code.strip().replace("\n", " ")
                    subtitle_parsed_contents.append([parsed_description, parsed_code])
                    description = ""  # reset so the same description is not reused
            else:
                # Concatenate the non-code content into a single description string
                description += (
                    "\n" + content.get_text() if description else content.get_text()
                )
        parsed_subtitles.append([subtitle_title, subtitle_parsed_contents])
    # Save the results as a structured JSON object
    title = title.strip().replace(" ", "_").lower()
    output = {"title": title}
    for subtitle_title, contents in parsed_subtitles:
        output[subtitle_title] = contents
    os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists
    with open(f"{output_dir}/{title}.json", "w") as f:
        json.dump(output, f)
    return output
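
# Illustrative sketch of the saved JSON shape (an assumption for a page titled
# "Container API" with an h2 "Creating a Container"; real keys depend on the page):
# {
#     "title": "container_api",
#     "Creating a Container": [["description text ...", "code text ..."]]
# }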


def crawl_strapi_documentation(url, output_dir="outputs"):
    pass


if __name__ == "__main__":
    output_dir = "outputs"
    # example 1: crawl the description page of the dotCMS container API
    # dotCMS_url = 'https://www.dotcms.com/docs/latest/container-api'
    # output = crawl_dotCMS_description_page(url=dotCMS_url, output_dir=output_dir)
    # example 2: crawl a Strapi documentation page
    # (crawl_strapi_documentation is still a stub)
    pass
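    # example 3 (illustrative, assuming example 1 has been run): load the saved
    # JSON back into a dict; the filename "container_api.json" is derived from
    # the page's <h1> title and may differ in practice
    # with open(f"{output_dir}/container_api.json") as f:
    #     data = json.load(f)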