Last active
December 31, 2019 01:26
-
-
Save ScribbleGhost/77af43af2fe8d45163c2d3f8bfabd7a1 to your computer and use it in GitHub Desktop.
For all files of a specific file type in a folder, remove all HTML tags and save the results as new files. Article here: https://scribbleghost.net/2019/12/31/remove-html-from-local-files-with-python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import os so we can list the folder and build file paths
import os

# Import Beautiful Soup 4 so we can parse HTML
from bs4 import BeautifulSoup

# Set the path where the target files are located
path = r'C:/some_folder'

# Set the file extension to look for (without the leading dot)
ext = 'ass'

# Start a loop - for each entry in the path
for filename in os.listdir(path):
    # Match on the dot plus the extension so names that merely end in the
    # same letters (for example "notes.class") are not picked up by mistake
    if not filename.endswith('.' + ext):
        continue

    # Get the file path including the file and the file extension
    fullpath = os.path.join(path, filename)

    # Get the file name without the extension
    stem = os.path.splitext(os.path.basename(filename))[0]

    # Parse the file with Beautiful Soup; the with-block closes the input
    # file once the parser has consumed it
    with open(fullpath) as source:
        soup = BeautifulSoup(source, 'html.parser')

    # Keep only the text, dropping all tags
    text = soup.get_text()

    # Make a new file where the content can be saved. Mode "x" refuses to
    # overwrite an existing file, and because the name is relative the file
    # is created in the current working directory, not in `path`
    with open(stem + '-new.' + ext, "x") as f:
        # Write the content to the file
        f.write(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment