Created
July 2, 2024 21:54
-
-
Save woshichuanqilz/8633cb3acf54c8e8f0c045a13e9b6069 to your computer and use it in GitHub Desktop.
Extract the text content from HTML files and merge it into a single text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from bs4 import BeautifulSoup | |
from multiprocessing import Pool | |
def extract_html_content(file_path):
    """Return the text of the HTML file at ``file_path`` with markup removed.

    The file is decoded as UTF-8 and parsed with BeautifulSoup's
    ``html.parser`` backend; the result of ``get_text()`` on the parsed
    document is returned as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    # Parse once and immediately reduce the tree to its text nodes.
    return BeautifulSoup(markup, 'html.parser').get_text()
def merge_html_content(html_content, output_file):
    """Write the non-blank lines of every document to ``output_file``.

    Args:
        html_content: Iterable of strings, one per extracted document.
        output_file: Path of the file to create or overwrite; it is
            written as UTF-8.

    Each document is split on ``'\\n'``; lines that are empty or contain
    only whitespace are dropped, and the remaining lines are written
    followed by a newline. Documents with no non-blank lines are skipped
    entirely so that they do not leave a stray blank line in the output.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for content in html_content:
            # Exclude empty / whitespace-only lines.
            filtered_lines = [line for line in content.split('\n') if line.strip()]
            if not filtered_lines:
                # '\n'.join([]) + '\n' would emit a blank line, which is
                # exactly what the filtering is meant to avoid.
                continue
            f.write('\n'.join(filtered_lines) + '\n')
def process_html_files(folder_path):
    """Collect the paths of all ``.html`` files beneath ``folder_path``.

    The directory tree is traversed with ``os.walk``; every file whose
    name ends with ``".html"`` (case-sensitive) is returned as the walk
    directory joined with the file name, in the order the walk yields
    them.
    """
    matches = []
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if name.endswith(".html"):
                matches.append(os.path.join(current_dir, name))
    return matches
if __name__ == '__main__':
    # Input directory (searched recursively) and destination of the merged text.
    folder_path = '.'
    output_file = 'merged_html_content.txt'

    html_files = process_html_files(folder_path)

    # Parse the files in parallel worker processes; map() returns the
    # extracted texts in the same order as html_files.
    with Pool() as pool:
        html_content = pool.map(extract_html_content, html_files)

    merge_html_content(html_content, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment