## Scraping Naukri.com job counts for data-science skills
#### Selenium is used because a plain request could not retrieve the rendered HTML element, so a real browser is driven instead
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium.webdriver.chrome.options import Options
# Request headers (unused by the Selenium flow below; presumably left over from the direct-requests attempt mentioned above)
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
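# Why Selenium rather than requests: the job-count <span> scraped below is not
# present in the raw HTML that a plain GET returns, which is why the earlier
# requests-based attempt failed. A hypothetical sketch of that attempt, for
# reference only (not executed here):
#
#   import requests
#   raw_html = requests.get('https://www.naukri.com/data-science-python-jobs?k=data%20science%2C%20', headers=header).text
#   BeautifulSoup(raw_html, 'html5lib').find('span', class_='fleft grey-text mr-5 fs12')  # -> None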
# search terms (skills) to look up; the empty string queries plain data-science jobs with no extra skill
search_terms = [
    "",
    "Python",
    "SQL",
    "R",
    "Spark",
    "Hadoop",
    "Java",
    "Tableau",
    "AWS",
    "SAS",
    "Hive",
    "Scala",
    "Excel",
    "TensorFlow",
    "C++",
    "Azure",
    "NoSQL",
    "Linux",
    "C",
    "Matlab",
    "Scikit-learn",
    "Pandas",
    "Git",
    "Keras",
    "Javascript",
    "Pig",
    "Hbase",
    "Google Cloud",
    "Docker",
    "NumPy",
    "PyTorch",
    "C#",
    "SPSS",
    "MySQL",
    "Perl",
    "Cassandra",
    "MongoDB",
    "GCP",
    "Kubernetes",
    "D3",
    "Databricks",
    "postgresql",
    "Caffe",
    "Airflow",
    "Alteryx",
    "BigQuery",
    # "Fastai",
]
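# Hypothetical helper (an assumption, not wired into the loop below): terms such
# as "C++", "C#", and "Google Cloud" contain spaces or characters that are not
# URL-path safe, so a slug step like this may be needed before interpolating
# them into the Naukri URL.
from urllib.parse import quote

def term_to_slug(term):
    # lower-case, hyphenate spaces, and percent-encode anything else
    return quote(term.lower().replace(' ', '-'), safe='-')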
naukri_list = []

# one headless Chrome session is reused for every search term and closed at the end
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
options = Options()
options.headless = True
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)

for term in search_terms:
    url = f'https://www.naukri.com/data-science-{term}-jobs?k=data%20science%2C%20'
    try:
        print(url)
        driver.get(url)
        time.sleep(2)  # give the page a moment to render the job-count element
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        # print(soup.prettify())
        try:
            # the last token of the count span's text is treated as the number of matching jobs
            data = soup.find('span', class_='fleft grey-text mr-5 fs12').get_text().strip().split()[-1]
            print(data, term)
            naukri_list.append(data)
        except (AttributeError, IndexError):
            # the count element was not found on the page
            naukri_list.append('0')
    except Exception as e:
        print(f'error: {e}')
        naukri_list.append('0')  # keep the results aligned with search_terms

driver.quit()

print(naukri_list)
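
# A minimal follow-up sketch (assumption: this is why pandas is imported above,
# though it is never used otherwise): pair each skill with its scraped count.
counts = pd.DataFrame({'skill': search_terms, 'job_count': naukri_list})
# the scraped counts may contain thousands separators (e.g. "1,234"), so coerce to numbers
counts['job_count'] = pd.to_numeric(
    counts['job_count'].str.replace(',', '', regex=False), errors='coerce'
).fillna(0).astype(int)
print(counts.sort_values('job_count', ascending=False).head(10))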