mirror of
https://github.com/weyne85/PentestGPT.git
synced 2025-10-29 16:58:59 +00:00
40 lines
1.3 KiB
Python
40 lines
1.3 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
|
|
def _collect_section_pairs(subtitle):
    """Return the ``[description, code]`` pairs belonging to one ``<h2>``.

    Walks the ``<p>`` and ``<pre>`` siblings that follow *subtitle* and stops
    at the next ``<h2>``, so content from later sections is not attributed to
    this one. A ``<p>`` is treated as a description and a ``<pre>`` as the
    code chunk for the description immediately before it.
    """
    pairs = []
    pending_description = None
    # 'h2' is included in the filter only so the section boundary is visible.
    for sibling in subtitle.find_next_siblings(['h2', 'p', 'pre']):
        if sibling.name == 'h2':
            break
        text = sibling.get_text()
        if sibling.name == 'p':
            # Two paragraphs in a row: the earlier one has no code chunk.
            if pending_description is not None:
                pairs.append([pending_description, ''])
            pending_description = text
        else:
            # A <pre> with no preceding paragraph gets an empty description.
            pairs.append([pending_description or '', text])
            pending_description = None
    if pending_description is not None:
        pairs.append([pending_description, ''])
    return pairs


def parse_markdown_page(url):
    """Download an HTML page and extract its title and per-section content.

    The page's first ``<h1>`` becomes the title. For every ``<h2>`` the
    following ``<p>``/``<pre>`` siblings (up to the next ``<h2>``) are
    collected as ``[description, code]`` pairs. The result is written to
    ``output.json`` in the current working directory and also returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the key ``'title'`` (text of the first ``<h1>``, or ``''``
        when the page has none) and one key ``'subtitle<N>'`` per ``<h2>``
        (numbered from 1 in document order), each mapping to a list of
        ``[description, code]`` pairs.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps an unresponsive server from blocking the call forever.
    page = requests.get(url, timeout=30)
    # Fail on 4xx/5xx rather than parsing an error page as if it were content.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract the title of the page; tolerate pages without an <h1>.
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else ''

    # Extract the subtitles and their descriptions and code chunks. The
    # heading text itself is not stored; keys stay 'subtitle1', 'subtitle2',
    # ... so the output schema is unchanged.
    output = {'title': title}
    for index, subtitle in enumerate(soup.find_all('h2'), start=1):
        output[f'subtitle{index}'] = _collect_section_pairs(subtitle)

    # Save the results as a structured JSON object.
    with open('output.json', 'w', encoding='utf-8') as outfile:
        json.dump(output, outfile, indent=4)

    return output
|
|
|
|
# Example run: parse one dotCMS documentation page and print the result.
# Guarded so that importing this module does not trigger a network request
# or overwrite output.json as a side effect.
if __name__ == "__main__":
    url = 'https://www.dotcms.com/docs/latest/container-api'
    output = parse_markdown_page(url)
    print(output)