LinkedIn Profile Extractor
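A Python script that parses a LinkedIn search-results page saved to disk, pulls each result's name, profile URL, title, and company out of the markup with BeautifulSoup, and writes the rows to a CSV file.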
from bs4 import BeautifulSoup
import re
import csv

# Path to the saved LinkedIn search-results page
html_file_path = '/mnt/data/lease-abstractions.html'

# Read the HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Sanitize the HTML content before parsing
def sanitize_html(content):
    # Remove the escaped empty-comment artifact \x3C!---->
    content = re.sub(r'\\x3C!---+>', '', content)
    # Collapse newlines, tabs, and carriage returns into spaces and trim the ends
    content = re.sub(r'[\n\t\r]+', ' ', content).strip()
    return content

sanitized_html = sanitize_html(html_content)

# Extract one record per search result: Name, LinkedIn URL, Title, Company
def extract_profiles(html):
    soup = BeautifulSoup(html, 'html.parser')
    profiles = []
    for entity in soup.find_all("div", class_="entity-result__item"):
        profile = {}
        # LinkedIn URL without query parameters
        link_tag = entity.find("a", class_="app-aware-link")
        if link_tag and 'href' in link_tag.attrs:
            profile["LinkedIn URL"] = link_tag['href'].split('?')[0].strip()
        # Name, taken from the avatar image's alt text
        name_tag = entity.find("img", class_="presence-entity__image")
        if name_tag and 'alt' in name_tag.attrs:
            profile["Name"] = name_tag['alt'].strip()
        # Title and Company, split out of the primary subtitle ("Title at Company")
        subtitle_tag = entity.find("div", class_="entity-result__primary-subtitle")
        if subtitle_tag:
            subtitle_text = subtitle_tag.get_text(" ", strip=True)
            subtitle_text = ' '.join(subtitle_text.split())
            if ' at ' in subtitle_text:
                title, company = subtitle_text.split(' at ', 1)
                profile["Title"] = title.strip()
                profile["Company"] = company.strip()
            else:
                profile["Title"] = subtitle_text
                profile["Company"] = ''
        profiles.append(profile)
    return profiles

extracted_profiles = extract_profiles(sanitized_html)

# Output file path for the CSV
output_csv_path = '/mnt/data/extracted_profiles.csv'

# Write the extracted data to a CSV file
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Name', 'LinkedIn URL', 'Title', 'Company']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for profile in extracted_profiles:
        # Replace any remaining newline characters before writing
        for key, value in profile.items():
            profile[key] = value.replace('\n', ' ').strip()
        writer.writerow(profile)
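A minimal sketch of how the resulting CSV could be spot-checked once the script has run. The path and column names mirror output_csv_path and fieldnames above; the DictReader preview itself is an illustrative addition, not part of the original gist.

import csv

# Preview the first few rows of the generated CSV to confirm the columns line up
# (assumes the script above has already written extracted_profiles.csv)
with open('/mnt/data/extracted_profiles.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader):
        if i >= 5:
            break
        print(row['Name'], '|', row['Title'], '|', row['Company'], '|', row['LinkedIn URL'])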