Last active
December 4, 2018 03:01
-
-
Save kizernis/c3ec556f96e43352a7e65765ceaf4b57 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import re | |
# File with one keyword per line; each line is used as a regular-expression
# alternative and matched case-insensitively.
KEYWORDS_PATH = 'input.txt'
FEED_URL = 'https://www.kill-the-newsletter.com/feeds/axl40j979d8l9a8f07q2.xml'
# Seconds to wait for the feed server before giving up.
REQUEST_TIMEOUT = 30
# Everything up to and including this line of asterisks is discarded from
# each feed entry before blocks are extracted.
SECTION_SEPARATOR = '****************************'
# A block starts at a line beginning with "<number>) " and runs (lazily) up to
# the "Back to Top Back to Category Index" footer that ends a line.  Only the
# block body is captured.
BLOCK_RE = re.compile(
    r'^\d+\)\s+(.+?)\s+Back to Top Back to Category Index$',
    flags=re.MULTILINE | re.DOTALL,
)


def load_keywords(path=KEYWORDS_PATH):
    """Return the lines of the UTF-8 text file *path* as a list of strings."""
    with open(path, encoding='utf-8') as f:
        return f.read().splitlines()


def build_keyword_pattern(keywords):
    """Compile *keywords* into one case-insensitive alternation pattern.

    Blank (empty or whitespace-only) entries are dropped: an empty
    alternative matches every string, which would mark every block as a hit.
    When no usable keyword remains, the returned pattern never matches.
    """
    usable = [keyword for keyword in keywords if keyword.strip()]
    if not usable:
        # An empty negative lookahead can never succeed.
        return re.compile(r'(?!)')
    return re.compile('|'.join(usable), flags=re.IGNORECASE)


def extract_blocks(text):
    """Return the bodies of the numbered blocks found in *text*.

    Only the part of *text* after the first SECTION_SEPARATOR is searched.
    If the separator is absent an empty list is returned instead of raising.
    Runs of three or more line-break characters are collapsed to a single
    blank line before matching.
    """
    _, separator, body = text.partition(SECTION_SEPARATOR)
    if not separator:
        return []
    body = re.sub(r'[\r\n]{3,}', '\n\n', body)
    return BLOCK_RE.findall(body)


def entry_plain_text(html):
    """Convert the HTML of one feed entry to plain text, turning <br> into newlines."""
    soup = BeautifulSoup(html, 'lxml')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    return soup.get_text()


def main():
    """Fetch the feed and print every block that mentions a keyword.

    Each matching block is printed followed by three newlines, and a summary
    line with the total and matched block counts is printed at the end.
    Output goes to stdout.

    Raises:
        requests.HTTPError: if the feed server answers with an error status.
    """
    pattern = build_keyword_pattern(load_keywords())

    response = requests.get(FEED_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than parsing an error page as if it were the feed.
    response.raise_for_status()
    feed = BeautifulSoup(response.text, 'lxml')

    blocks_count_total = 0
    blocks_count_matched = 0
    for content in feed.find_all('content'):
        # The entry body is HTML embedded in the feed, so it is parsed again.
        blocks = extract_blocks(entry_plain_text(content.get_text()))
        blocks_count_total += len(blocks)
        for block in blocks:
            if pattern.search(block):
                blocks_count_matched += 1
                print(block + '\n\n\n')

    print(f'Total blocks: {blocks_count_total}, matched blocks: {blocks_count_matched}')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment