PentestGPT/tasks/crawler.py
Grey_D a3e4d5696c feat: 🎸 API support
Add support for ChatGPT API (for pro only)
2023-03-19 19:13:33 +08:00

40 lines
1.3 KiB
Python

import requests
from bs4 import BeautifulSoup
import json
def _extract_section_pairs(heading):
    """Return the ``[description, code]`` pairs that follow one ``<h2>``.

    Only ``<p>`` and ``<pre>`` siblings located between *heading* and the
    next ``<h2>`` sibling are considered, so a section never picks up the
    content of the sections that come after it.
    """
    texts = []
    # Requesting 'h2' together with 'p'/'pre' lets the walk detect where the
    # next section starts and stop there.
    for sibling in heading.find_next_siblings(['h2', 'p', 'pre']):
        if sibling.name == 'h2':
            break
        texts.append(sibling.get_text())

    # Elements are paired positionally: even index -> description,
    # odd index -> code. A trailing unpaired element gets an empty code string.
    return [
        [texts[i], texts[i + 1] if i + 1 < len(texts) else '']
        for i in range(0, len(texts), 2)
    ]


def parse_markdown_page(url, *, timeout=30, output_path='output.json'):
    """Fetch an HTML page and extract its title and per-section content.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    The text of the first ``<h1>`` becomes the title; for every ``<h2>`` the
    following ``<p>``/``<pre>`` siblings (up to the next ``<h2>``) are grouped
    into ``[description, code]`` pairs. The result is written to
    *output_path* as indented JSON and also returned.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
        output_path: File that receives the JSON dump of the result.

    Returns:
        A dict with the key ``'title'`` (empty string when the page has no
        ``<h1>``) and one key ``'subtitle<N>'`` (1-based, in document order)
        per ``<h2>``, each mapping to a list of ``[description, code]`` pairs.
        The heading text of each ``<h2>`` itself is not part of the result.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract the title of the page; tolerate pages without an <h1>.
    title_tag = soup.find('h1')
    title = title_tag.get_text() if title_tag is not None else ''

    # Build the structured result, one entry per <h2> section.
    output = {'title': title}
    for index, subtitle in enumerate(soup.find_all('h2'), start=1):
        output[f'subtitle{index}'] = _extract_section_pairs(subtitle)

    # Save the results as a structured JSON object.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(output, outfile, indent=4)
    return output
if __name__ == '__main__':
    # Demo run: only executed when the file is run as a script, so importing
    # this module does not trigger a network request or write output.json.
    url = 'https://www.dotcms.com/docs/latest/container-api'
    output = parse_markdown_page(url)
    print(output)