Code Improvement
""" | |
https://pastebin.com/raw/dxUqR7RH | |
https://www.facebook.com/groups/PythonUserDeutschland/?multi_permalinks=2079508368861796¬if_id=1585918939113861¬if_t=group_highlights&ref=notif | |
""" | |
import csv
from contextlib import contextmanager
from datetime import datetime

import ratelimit
import requests
from bs4 import BeautifulSoup

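# Context manager that opens the output CSV, writes the header row once,
# and yields the csv.writer so the caller can append one row per page.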
@contextmanager
def csv_writer(filename):
    # newline="" keeps the csv module from inserting blank lines on Windows
    with open(filename, "w", newline="", encoding="utf-8") as fd:
        writer = csv.writer(fd)
        writer.writerow(
            ("Index", "", "", "url", "meta_description_length", "meta_description_text")
        )
        yield writer

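# sleep_and_retry blocks until the next call is allowed instead of raising,
# so the crawler never exceeds 20 requests per 60 seconds.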
@ratelimit.sleep_and_retry
@ratelimit.limits(calls=20, period=60)
def get_bs4(url):
    return BeautifulSoup(requests.get(url, timeout=30).content, "html.parser")

def get_sitemap_urls():
    soup = get_bs4("https://draeger-it.blog/sitemap_index.xml")
    return [element.text for element in soup.find_all("loc")]

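# The sitemap index only lists sub-sitemaps; this generator fetches each one
# and lazily yields every page URL it contains.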
def get_urls_from_sitemaps(sitemaps):
    for sitemap_url in sitemaps:
        sitemap = get_bs4(sitemap_url)
        for loc in sitemap.find_all("loc"):
            yield loc.text

def crawl_meta_description(url, index):
    parser = get_bs4(url)
    content = parser.find("meta", attrs={"name": "description"})
    if content is not None:
        meta_description = content.get("content", "")
    else:
        meta_description = ""
    return str(index), "", "", url, str(len(meta_description)), meta_description

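# Walk every URL from every sitemap, write one CSV row per page, and log
# start/end timestamps so a stalled request is easy to spot.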
def crawl():
    sitemaps = get_sitemap_urls()
    urls = get_urls_from_sitemaps(sitemaps)
    with csv_writer("output.csv") as writer:
        for index, url in enumerate(urls):
            print(f"Start crawling ({datetime.now().isoformat()}) {url}")
            result = crawl_meta_description(url, index)
            writer.writerow(result)
            print(f"Finished crawling ({datetime.now().isoformat()}) {url}")

if __name__ == "__main__":
    crawl()
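A note on running it: the script depends on three third-party packages, requests, beautifulsoup4, and ratelimit, all installable from PyPI with pip install requests beautifulsoup4 ratelimit. Running the file directly (via the __main__ guard above) writes the results to output.csv in the current directory.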