Created
October 30, 2023 19:01
-
-
Save hmelenok/60f194b1dc70a73c67acb7d718c57c58 to your computer and use it in GitHub Desktop.
Script that converts the Google Takeout YouTube watch-history HTML file into a CSV file (exported columns: Products, Video title, Video Link, Channel Name, Channel Link, Date)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

from bs4 import BeautifulSoup

# Convert 'input.html' into 'output.csv', one CSV row per '.outer-cell' block
# found in the HTML document.

# Selector for the cell that holds the video link, the channel link and the date.
_CONTENT_CELL_SELECTOR = '.content-cell.mdl-cell.mdl-cell--6-col.mdl-typography--body-1'


def _link_fields(link):
    """Return ``(text, href)`` for an ``<a>`` tag, or two empty strings for None.

    A missing ``href`` attribute yields an empty string instead of raising
    ``KeyError``.
    """
    if link is None:
        return '', ''
    return link.get_text(strip=True), link.get('href', '')


def _extract_row(outer_cell):
    """Build one CSV row from a single '.outer-cell' block.

    Args:
        outer_cell: the BeautifulSoup tag for one '.outer-cell' element.

    Returns:
        A tuple ``(product, video_title, video_link, channel_name,
        channel_link, date)``. Any piece that is absent from the block is
        returned as an empty string, so one malformed entry does not abort
        the whole export.
    """
    # select_one() returns None when nothing matches, so guard before use.
    title_tag = outer_cell.select_one('.mdl-typography--title')
    product = title_tag.get_text(strip=True) if title_tag is not None else ''

    content_cell = outer_cell.select_one(_CONTENT_CELL_SELECTOR)
    if content_cell is None:
        return product, '', '', '', '', ''

    # The first link is taken as the video, the second as the channel.
    links = content_cell.find_all('a')
    video_title, video_link = _link_fields(links[0] if links else None)
    channel_name, channel_link = _link_fields(links[1] if len(links) > 1 else None)

    # The date is assumed to be the last text fragment of the content cell;
    # an empty cell gives an empty date rather than an IndexError.
    fragments = list(content_cell.stripped_strings)
    date = fragments[-1] if fragments else ''

    return product, video_title, video_link, channel_name, channel_link, date


# Load the content from the HTML file
with open('input.html', 'r', encoding='utf-8') as file:
    content = file.read()

soup = BeautifulSoup(content, 'lxml')

# Prepare a list to hold the extracted data
data = []

# Find all '.outer-cell' blocks
outer_cells = soup.select('.outer-cell')
total_cells = len(outer_cells)
print(f"Total outer cells found: {total_cells}\nStarting data extraction...")

# Iterate over each '.outer-cell' block, numbering from 1 for the progress output
for index, outer_cell in enumerate(outer_cells, 1):
    data.append(_extract_row(outer_cell))
    # Print progress
    print(f"Processed cell {index} of {total_cells}")

print("\nWriting data to output.csv...")

if __name__ == "__main__":
    # Write the extracted data to a CSV file. newline='' lets the csv module
    # control line endings itself.
    with open('output.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # Write the headers
        writer.writerow(['Products', 'Video title', 'Video Link', 'Channel Name', 'Channel Link', 'Date'])
        # Write the extracted data
        writer.writerows(data)
    print("Data has been written to output.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment