-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
69 lines (59 loc) · 2.69 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import argparse
import logging
import requests
from bs4 import BeautifulSoup
# Configure the root logger once at import time: INFO and above, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
class MarkdownScraper:
    """Fetch a web page and render a simplified Markdown digest of it.

    The generated Markdown groups content by element type (headings,
    paragraphs, links, images, unordered lists) in that order; it does not
    preserve the original document order.
    """

    # Seconds to wait for the server. Without an explicit timeout,
    # ``requests`` can block indefinitely on an unresponsive host.
    DEFAULT_TIMEOUT = 10

    def __init__(self):
        """Create a scraper backed by a reusable ``requests.Session``."""
        self.session = requests.Session()

    def scrape_website(self, url, timeout=DEFAULT_TIMEOUT):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded response body.

        Raises:
            requests.exceptions.HTTPError: The server answered with a
                4xx/5xx status.
            requests.exceptions.RequestException: Any other request failure
                (connection error, timeout, invalid URL, ...).
        """
        logging.info("Attempting to scrape the website: %s", url)
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.HTTPError as http_err:
            logging.error("HTTP error occurred: %s", http_err)
            raise
        except requests.exceptions.RequestException as err:
            logging.error("An error occurred: %s", err)
            raise
        logging.info("Successfully retrieved the website content.")
        return response.text

    def convert_to_markdown(self, html_content):
        """Convert an HTML document into a Markdown string.

        Args:
            html_content: HTML source text.

        Returns:
            Markdown consisting of a ``# <title>`` line followed by the
            headings, paragraphs, links, images and unordered lists found in
            the document, separated by blank lines.
        """
        logging.info("Converting HTML content to Markdown.")
        soup = BeautifulSoup(html_content, "html.parser")

        # ``soup.title.string`` is None for an empty <title> or one with
        # nested tags, which would render as "# None"; use the text content
        # and fall back to the placeholder when it is empty.
        title = soup.title.get_text(strip=True) if soup.title else ""
        if not title:
            title = "No Title"

        # Keep each heading's own level (h1 -> "#", ..., h6 -> "######")
        # instead of flattening every heading to "##".
        headers = [
            f"{'#' * int(header.name[1])} {header.get_text()}"
            for header in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        ]
        paragraphs = [p.get_text() for p in soup.find_all("p")]
        links = [
            f"[{a.get_text()}]({a['href']})"
            for a in soup.find_all("a", href=True)
        ]
        images = [
            f"![{img.get('alt', 'image')}]({img['src']})"
            for img in soup.find_all("img", src=True)
        ]

        lists = []
        for ul in soup.find_all("ul"):
            items = [f"- {li.get_text()}" for li in ul.find_all("li")]
            # A <ul> without any <li> would otherwise add an empty section.
            if items:
                lists.append("\n".join(items))

        sections = headers + paragraphs + links + images + lists
        markdown_content = f"# {title}\n\n" + "\n\n".join(sections)
        logging.info("Conversion to Markdown completed.")
        return markdown_content

    def save_markdown(self, markdown_content, output_file):
        """Write *markdown_content* to *output_file*, replacing any existing file.

        Args:
            markdown_content: Text to write.
            output_file: Destination path.

        Raises:
            OSError: The file cannot be opened or written.
        """
        # Scraped pages routinely contain non-ASCII text; pin UTF-8 so the
        # write does not depend on the platform's default encoding.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(markdown_content)
        logging.info("Markdown file '%s' has been created successfully.", output_file)
def main(url, output_file):
    """Scrape *url*, convert the page to Markdown and save it to *output_file*.

    Failures in any step are logged rather than raised, so this function can
    serve as a top-level entry point.

    Args:
        url: Address of the page to scrape.
        output_file: Path of the Markdown file to create.

    Returns:
        0 when the file was written, 1 when any step failed. The value is
        suitable for use as a process exit status.
    """
    scraper = MarkdownScraper()
    try:
        html_content = scraper.scrape_website(url)
        markdown_content = scraper.convert_to_markdown(html_content)
        scraper.save_markdown(markdown_content, output_file)
    except Exception as e:  # top-level boundary: report, then signal failure
        logging.error("An error occurred during the process: %s", e)
        return 1
    return 0


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape a website and convert it to Markdown.")
    parser.add_argument("url", type=str, help="The URL of the website to scrape")
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="output.md",
        help="The output Markdown file name",
    )
    args = parser.parse_args()
    # Propagate the result as the exit status so shells and CI can detect a
    # failed scrape instead of always seeing 0.
    raise SystemExit(main(args.url, args.output))