arminwasicek · April 23, 2024 16:18 · doterobcn · Apr 23, 2024
diff --git a/ScrapeLinkedInJobtitle.ipynb b/ScrapeLinkedInJobtitle.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape Job Titles from LinkedIn\n",
    "\n",
    "The trick is to go through Google to get the data, because LinkedIn restricts its API. The script queries Google for a name plus the keywords _linkedin professional profile_ to make sure it hits the profile data. It then extracts the name, job title, and company from the first search result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "from lxml.html import html5parser\n",
    "import re\n",
    "\n",
    "def decomptitle(title):\n",
    "    \"\"\"Split a search-result title element into its three text parts.\n",
    "\n",
    "    The element is stringified and re-parsed with ``html5parser``; the text of\n",
    "    the first three children of the parsed root's first child is returned.\n",
    "\n",
    "    Args:\n",
    "        title: The title element; only ``str(title)`` is used.\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(name, profile, linkedin)`` -- the ``.text`` of child 0, 1 and 2.\n",
    "        NOTE(review): the names suggest person name, profile label and site\n",
    "        name; confirm against the live markup.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If the parsed markup has fewer than three such children.\n",
    "    \"\"\"\n",
    "    root = html5parser.fromstring(str(title))\n",
    "    # Index the element directly: lxml deprecates getchildren() in favour of\n",
    "    # list-style access, and the shared parent only needs to be looked up once.\n",
    "    parts = root[0]\n",
    "    name = parts[0].text\n",
    "    profile = parts[1].text\n",
    "    linkedin = parts[2].text\n",
    "    return name, profile, linkedin\n",
    "\n",
    "def decompjobdesc(jobdesc):\n",
    "    \"\"\"Split a search-result snippet into location, job title and company.\n",
    "\n",
    "    The parsed text is expected to look like\n",
    "    ``<location> - <job> at <company> - <company>``, where the two outer\n",
    "    separators are a dash surrounded by non-breaking spaces.\n",
    "\n",
    "    Args:\n",
    "        jobdesc: The snippet element; only ``str(jobdesc)`` is used.\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(location, job, company)``, or ``(None, None, None)`` when the\n",
    "        text does not have the expected shape or the two company names differ.\n",
    "    \"\"\"\n",
    "    unparsed = (None, None, None)\n",
    "    root = html5parser.fromstring(str(jobdesc))\n",
    "    if root.text is None:\n",
    "        # str(None) would otherwise be parsed as the literal text 'None'.\n",
    "        return unparsed\n",
    "    fields = root.text.split('\\xa0-\\xa0')\n",
    "    if len(fields) != 3:\n",
    "        # Unexpected snippet layout: report it the same way as a company mismatch\n",
    "        # instead of failing on tuple unpacking.\n",
    "        return unparsed\n",
    "    location, job_at_company, company = fields\n",
    "    # Split on the last ' at ' so job titles that contain ' at ' stay intact.\n",
    "    job, sep, company2 = job_at_company.rpartition(' at ')\n",
    "    if not sep or company != company2:\n",
    "        return unparsed\n",
    "    return location, job, company\n",
    "\n",
    "def scrape_job_title(firstname, lastname):\n",
    "    \"\"\"Look up a person's job title via a Google search for their LinkedIn profile.\n",
    "\n",
    "    Queries Google for ``<firstname> <lastname> linkedin professional profile``\n",
    "    and parses the first ``h3.r`` title and ``div.slp`` snippet of the response.\n",
    "\n",
    "    Args:\n",
    "        firstname: First name to search for.\n",
    "        lastname: Last name to search for.\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(job, name, company)``. ``job`` and ``company`` are whatever\n",
    "        ``decompjobdesc`` returns for the snippet (possibly ``None``); ``name``\n",
    "        is the first value returned by ``decomptitle``.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If the response has no matching title or snippet element.\n",
    "        requests.exceptions.RequestException: If the request fails or times out.\n",
    "    \"\"\"\n",
    "    query = '{} {} linkedin professional profile'.format(firstname, lastname)\n",
    "    # Pass the query via params so requests URL-encodes it; formatting raw names\n",
    "    # into the URL breaks on spaces, '&', '#' and non-ASCII characters.\n",
    "    # The timeout keeps the call from hanging forever on a stalled connection.\n",
    "    r = requests.get('https://www.google.com/search', params={'q': query}, timeout=10)\n",
    "    soup = BeautifulSoup(r.text, \"lxml\")\n",
    "    title = soup.find(\"h3\", {\"class\": \"r\"})\n",
    "    jobdesc = soup.find(\"div\", {\"class\": \"slp\"})\n",
    "    if title is None or jobdesc is None:\n",
    "        # Same exception type the former find_all(...)[0] raised, now with a message.\n",
    "        raise IndexError('no LinkedIn result found for {} {}'.format(firstname, lastname))\n",
    "    name, _profile, _linkedin = decomptitle(title)\n",
    "    _location, job, company = decompjobdesc(jobdesc)\n",
    "    return job, name, company\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scrape_job_title('John', 'Doe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:sumo]",
   "language": "python",
   "name": "conda-env-sumo-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Scrape Job Titles from LinkedIn\n",
	"\n",
	"The trick is to go through Google to get the data, because LinkedIn restricts its API. The script queries Google for a name plus the keywords _linkedin professional profile_ to make sure it hits the profile data. It then extracts the name, job title, and company from the first search result."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from bs4 import BeautifulSoup\n",
	"import requests\n",
	"from lxml.html import html5parser\n",
	"import re\n",
	"\n",
	"def decomptitle(title):\n",
	"    \"\"\"Split a search-result title element into its three text parts.\n",
	"\n",
	"    The element is stringified and re-parsed with ``html5parser``; the text of\n",
	"    the first three children of the parsed root's first child is returned.\n",
	"\n",
	"    Args:\n",
	"        title: The title element; only ``str(title)`` is used.\n",
	"\n",
	"    Returns:\n",
	"        tuple: ``(name, profile, linkedin)`` -- the ``.text`` of child 0, 1 and 2.\n",
	"        NOTE(review): the names suggest person name, profile label and site\n",
	"        name; confirm against the live markup.\n",
	"\n",
	"    Raises:\n",
	"        IndexError: If the parsed markup has fewer than three such children.\n",
	"    \"\"\"\n",
	"    root = html5parser.fromstring(str(title))\n",
	"    # Index the element directly: lxml deprecates getchildren() in favour of\n",
	"    # list-style access, and the shared parent only needs to be looked up once.\n",
	"    parts = root[0]\n",
	"    name = parts[0].text\n",
	"    profile = parts[1].text\n",
	"    linkedin = parts[2].text\n",
	"    return name, profile, linkedin\n",
	"\n",
	"def decompjobdesc(jobdesc):\n",
	"    \"\"\"Split a search-result snippet into location, job title and company.\n",
	"\n",
	"    The parsed text is expected to look like\n",
	"    ``<location> - <job> at <company> - <company>``, where the two outer\n",
	"    separators are a dash surrounded by non-breaking spaces.\n",
	"\n",
	"    Args:\n",
	"        jobdesc: The snippet element; only ``str(jobdesc)`` is used.\n",
	"\n",
	"    Returns:\n",
	"        tuple: ``(location, job, company)``, or ``(None, None, None)`` when the\n",
	"        text does not have the expected shape or the two company names differ.\n",
	"    \"\"\"\n",
	"    unparsed = (None, None, None)\n",
	"    root = html5parser.fromstring(str(jobdesc))\n",
	"    if root.text is None:\n",
	"        # str(None) would otherwise be parsed as the literal text 'None'.\n",
	"        return unparsed\n",
	"    fields = root.text.split('\\xa0-\\xa0')\n",
	"    if len(fields) != 3:\n",
	"        # Unexpected snippet layout: report it the same way as a company mismatch\n",
	"        # instead of failing on tuple unpacking.\n",
	"        return unparsed\n",
	"    location, job_at_company, company = fields\n",
	"    # Split on the last ' at ' so job titles that contain ' at ' stay intact.\n",
	"    job, sep, company2 = job_at_company.rpartition(' at ')\n",
	"    if not sep or company != company2:\n",
	"        return unparsed\n",
	"    return location, job, company\n",
	"\n",
	"def scrape_job_title(firstname, lastname):\n",
	"    \"\"\"Look up a person's job title via a Google search for their LinkedIn profile.\n",
	"\n",
	"    Queries Google for ``<firstname> <lastname> linkedin professional profile``\n",
	"    and parses the first ``h3.r`` title and ``div.slp`` snippet of the response.\n",
	"\n",
	"    Args:\n",
	"        firstname: First name to search for.\n",
	"        lastname: Last name to search for.\n",
	"\n",
	"    Returns:\n",
	"        tuple: ``(job, name, company)``. ``job`` and ``company`` are whatever\n",
	"        ``decompjobdesc`` returns for the snippet (possibly ``None``); ``name``\n",
	"        is the first value returned by ``decomptitle``.\n",
	"\n",
	"    Raises:\n",
	"        IndexError: If the response has no matching title or snippet element.\n",
	"        requests.exceptions.RequestException: If the request fails or times out.\n",
	"    \"\"\"\n",
	"    query = '{} {} linkedin professional profile'.format(firstname, lastname)\n",
	"    # Pass the query via params so requests URL-encodes it; formatting raw names\n",
	"    # into the URL breaks on spaces, '&', '#' and non-ASCII characters.\n",
	"    # The timeout keeps the call from hanging forever on a stalled connection.\n",
	"    r = requests.get('https://www.google.com/search', params={'q': query}, timeout=10)\n",
	"    soup = BeautifulSoup(r.text, \"lxml\")\n",
	"    title = soup.find(\"h3\", {\"class\": \"r\"})\n",
	"    jobdesc = soup.find(\"div\", {\"class\": \"slp\"})\n",
	"    if title is None or jobdesc is None:\n",
	"        # Same exception type the former find_all(...)[0] raised, now with a message.\n",
	"        raise IndexError('no LinkedIn result found for {} {}'.format(firstname, lastname))\n",
	"    name, _profile, _linkedin = decomptitle(title)\n",
	"    _location, job, company = decompjobdesc(jobdesc)\n",
	"    return job, name, company\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"scrape_job_title('John', 'Doe')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda env:sumo]",
	"language": "python",
	"name": "conda-env-sumo-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}