Last active: December 6, 2021 18:13
Python script to scrape company details from a public company page on LinkedIn.com. Written as part of a How to Scrape educational post - https://www.scrapehero.com/tutorial-scraping-linkedin-for-public-company-data/
from lxml import html
import json
import requests


def linkedin_companies_parser(url):
    # Try up to five times; LinkedIn sometimes serves a captcha or login redirect instead of the page.
    for _ in range(5):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
            }
            print("Fetching:", url)
            response = requests.get(url, headers=headers, verify=False)
            # The company data is embedded as JSON inside HTML comments; strip the comment markers so lxml keeps it.
            formatted_response = response.text.replace('<!--', '').replace('-->', '')
            doc = html.fromstring(formatted_response)
            datafrom_xpath = doc.xpath('//code[@id="stream-promo-top-bar-embed-id-content"]//text()')
            content_about = doc.xpath('//code[@id="stream-about-section-embed-id-content"]')
            if not content_about:
                content_about = doc.xpath('//code[@id="stream-footer-embed-id-content"]')
            if content_about:
                pass
                # json_text = content_about[0].html_content().replace('<code id="stream-footer-embed-id-content"><!--', '').replace('<code id="stream-about-section-embed-id-content"><!--', '').replace('--></code>', '')
            if datafrom_xpath:
                try:
                    json_formatted_data = json.loads(datafrom_xpath[0])
                    company_name = json_formatted_data.get('companyName')
                    size = json_formatted_data.get('size')
                    industry = json_formatted_data.get('industry')
                    description = json_formatted_data.get('description')
                    follower_count = json_formatted_data.get('followerCount')
                    year_founded = json_formatted_data.get('yearFounded')
                    website = json_formatted_data.get('website')
                    company_type = json_formatted_data.get('companyType')
                    specialities = json_formatted_data.get('specialties')

                    headquarters = json_formatted_data.get('headquarters') or {}
                    city = headquarters.get('city')
                    country = headquarters.get('country')
                    state = headquarters.get('state')
                    street1 = headquarters.get('street1')
                    street2 = headquarters.get('street2')
                    zip_code = headquarters.get('zip')
                    # Join the street lines, skipping whichever parts are missing.
                    street = ', '.join(part for part in (street1, street2) if part) or None

                    data = {
                        'company_name': company_name,
                        'size': size,
                        'industry': industry,
                        'description': description,
                        'follower_count': follower_count,
                        'founded': year_founded,
                        'website': website,
                        'type': company_type,
                        'specialities': specialities,
                        'city': city,
                        'country': country,
                        'state': state,
                        'street': street,
                        'zip': zip_code,
                        'url': url,
                    }
                    return data
                except ValueError:
                    print("can't parse page", url)
            # Retry in case of captcha or login page redirection
            if len(response.content) < 2000 or "trk=login_reg_redirect" in url:
                if response.status_code == 404:
                    print("linkedin page not found")
                else:
                    raise ValueError('redirecting to login page or captcha found')
        except Exception:
            print("retrying:", url)


def readurls():
    companyurls = ['https://www.linkedin.com/company/tata-consultancy-services']
    extracted_data = []
    for url in companyurls:
        extracted_data.append(linkedin_companies_parser(url))
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    readurls()
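For anyone who wants to call the parser from their own code rather than editing readurls(), here is a minimal usage sketch. The module filename linkedin_companies.py is an assumption (save the gist under that name), and it assumes the requests and lxml packages are installed.

# Hypothetical usage sketch; assumes the gist above is saved as linkedin_companies.py
# and that requests and lxml are installed (pip install requests lxml).
from linkedin_companies import linkedin_companies_parser

details = linkedin_companies_parser('https://www.linkedin.com/company/tata-consultancy-services')
if details:
    print(details['company_name'], details['industry'], details['follower_count'])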
Just found this site, which seems very useful; it has more than 20 automation scripts, and they are provided for free.
You can use this script to pull company data: TexAu - LinkedInCompanyInfo. You may have to create an account.
The link you have mentioned is not working.
Link: https://marketing.texau.com/api/getLinkedinCompanyInfo?Ref=gist7fb058f67a2988cba74957ef2ec013d1
I tried the original script and it wasn't working for me.