Skip to content

Instantly share code, notes, and snippets.

@rounakdatta
Last active October 18, 2019 20:08
Show Gist options
  • Save rounakdatta/d46ca394b549cce68c6b1205293c5762 to your computer and use it in GitHub Desktop.
Save rounakdatta/d46ca394b549cce68c6b1205293c5762 to your computer and use it in GitHub Desktop.
Pick up names, phone numbers, and emails from Kijiji ads
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import re
import codecs
import csv
import time
def writeToFile(table):
    """Append the rows in ``table`` to ``output.csv`` in the working directory.

    Args:
        table: Iterable of rows; each row is an iterable of cell values
            accepted by ``csv.writer``.

    The file is opened in append mode, so repeated calls accumulate rows.
    ``newline=''`` is passed as the ``csv`` module requires, so the writer's
    own line terminators are not translated a second time (which shows up as
    blank lines between rows on Windows). UTF-8 is requested explicitly so
    that non-ASCII text does not depend on the platform default encoding.
    """
    # The context manager closes the file even if writerows() raises.
    with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(table)
# Base URL of the site; the login path and relative ad links are appended to it.
parentUrl = "https://www.kijiji.ca"
driver = webdriver.Chrome()
print(sys.argv)

# logging in to Kijiji
# The login page is requested up to five times before giving up.
LFLAG = False
for retry in range(5):
    try:
        driver.get(parentUrl + '/t-login.html')
    except Exception as e:
        # ``except Exception`` (rather than a bare ``except:``) lets Ctrl-C and
        # SystemExit propagate; the error is printed instead of being hidden.
        print(e)
        continue
    LFLAG = True
    break

if not LFLAG:
    # Close the browser and exit with a non-zero status and a message, so a
    # failed run is distinguishable from a successful one.
    driver.quit()
    sys.exit("Could not load the Kijiji login page after 5 attempts")

# find_element(By.ID, ...) is used because the find_element_by_id helpers
# were removed in Selenium 4; this form works on older releases as well.
emailBox = driver.find_element(By.ID, 'LoginEmailOrNickname')
passwordBox = driver.find_element(By.ID, 'login-password')
loginButton = driver.find_element(By.ID, 'SignInButton')

# NOTE(review): the credentials are hard-coded in the script. The e-mail
# literal looks like a redaction placeholder rather than a real address --
# confirm and supply real values, ideally from outside the source file.
emailBox.send_keys('[email protected]')
passwordBox.send_keys('cabot#123')
loginButton.click()
# Matches phone numbers written like (416) 555-1234, 416-555-1234 or
# 4165551234. Compiled once here instead of once per ad.
_PHONE_NUMBER_REGEX = re.compile(r'((\()?\d\d\d(\)?)(-| )?(\d\d\d(-| )?\d\d\d\d))')

# Buffered rows are written to output.csv whenever this many have accumulated.
_ROWS_PER_FLUSH = 10


def _load_with_retries(browser, url, attempts=5):
    """Navigate ``browser`` to ``url``; return True as soon as a load succeeds.

    Up to ``attempts`` tries are made. Each failure is printed and retried;
    False is returned when every attempt raised.
    """
    for _ in range(attempts):
        try:
            browser.get(url)
        except Exception as e:
            # Not a bare ``except:`` so Ctrl-C / SystemExit still propagate.
            print(e)
            continue
        return True
    return False


def _collect_ad_urls(browser, base_url, wanted):
    """Walk the search result pages and return the list of ad URLs found.

    Collection stops when the end-of-pagination check matches or when at
    least ``wanted`` URLs have been gathered. The list may hold duplicates
    and may exceed ``wanted``, because whole pages are added at a time.
    """
    pageIndex = 1
    adUrls = []
    while True:
        currentPage = browser.current_url
        currentPageList = currentPage.split("/")

        # understanding the end of pagination: the URL we are on is compared
        # with the URL of the page before the one that was just requested.
        # NOTE(review): this check and the next-page URL built below are kept
        # exactly as written; from the second page onwards the new URL keeps
        # the previous "page-N" segment in it. Verify against the live site.
        checkerPageList = currentPageList[:-2] + ["page-" + str(pageIndex - 1)] + currentPageList[-1:]
        if '/'.join(checkerPageList) == currentPage:
            break

        # aggregating all the results in the current page; both <table> and
        # div.search-item elements can carry the data-vip-url attribute.
        for locator in ((By.TAG_NAME, "table"), (By.CSS_SELECTOR, 'div.search-item')):
            for result in browser.find_elements(*locator):
                adUrl = result.get_attribute("data-vip-url")
                if adUrl is not None:
                    adUrls.append(base_url + adUrl)

        # move to the next page
        pageIndex += 1
        newPageList = currentPageList[:-1] + ["page-" + str(pageIndex)] + currentPageList[-1:]
        newPage = '/'.join(newPageList)

        if not _load_with_retries(browser, newPage):
            # NOTE(review): as before, a page that cannot be loaded is skipped
            # and the loop starts over without checking the collected count.
            pageIndex += 1
            continue

        urlCollectedCount = len(adUrls)
        print("Collected {0} URLs".format(urlCollectedCount))
        if urlCollectedCount >= wanted:
            break
    return adUrls


def _open_ad_in_new_tab(browser, url):
    """Open one new tab, switch to it and load ``url``; return True on success.

    The tab is opened once and only the page load is retried, so a flaky load
    no longer leaves extra blank tabs behind. The new tab is found by comparing
    the window handles before and after opening it, not by list position.
    If no new handle appears, the load happens in the current window.
    """
    try:
        knownHandles = set(browser.window_handles)
        browser.execute_script('window.open()')
        for handle in browser.window_handles:
            if handle not in knownHandles:
                browser.switch_to.window(handle)
                break
    except Exception as e:
        print(e)
        return False
    return _load_with_retries(browser, url)


def _scrape_ad(browser):
    """Return ``[businessName, phoneNumber, personName, personEmail]`` for the
    ad page currently loaded in ``browser``.

    Any field that cannot be found is returned as an empty string.
    """
    # Paragraphs are joined with newlines so the last word of one paragraph
    # cannot fuse with the first word of the next and corrupt the matches.
    allText = "\n".join(p.text for p in browser.find_elements(By.TAG_NAME, "p"))

    # pick the business name from the ad
    try:
        businessName = browser.find_element(
            By.XPATH, '//*[@id="ViewItemPage"]/div[5]/div[1]/div[1]/div/h1'
        ).text
    except Exception as e:
        # A page without the expected title must not abort the whole run.
        print(e)
        businessName = ""
    print(businessName)
    allText += ("\n" + businessName)

    # Type a greeting into the reply box when the page has one. The text is
    # only typed; the submit call below is disabled.
    try:
        messageBox = browser.find_element(By.ID, 'message')
        messageBox.send_keys(Keys.CONTROL, 'a')
        messageBox.send_keys('Hey there!')
        # messageBox.submit()
    except Exception as e:
        print(e)

    # pick the phone number from the ad; group 1 wraps the whole pattern, so
    # it is always the longest capture.
    phoneMatch = _PHONE_NUMBER_REGEX.search(allText)
    phoneNumber = phoneMatch.group(1) if phoneMatch else ""
    print(phoneNumber)

    # pick the person name from the ad
    try:
        personName = browser.find_element(
            By.XPATH, '//*[@id="vip-body"]/div[6]/div[3]/div/div[1]/div/a'
        ).text
    except Exception:
        personName = ""
    print(personName)

    # pick up the email from the ad: the longest candidate wins (the last one
    # when several share the greatest length).
    allEmailsCaptured = re.findall(r'\S+@\S+', allText)
    personEmail = sorted(allEmailsCaptured, key=len)[-1] if allEmailsCaptured else ""
    print(personEmail)

    return [businessName, phoneNumber, personName, personEmail]


resultsWanted = int(sys.argv[1])
print("Getting approximately {0} results for each query".format(resultsWanted))

for searchQuery in sys.argv[2:]:
    if not _load_with_retries(driver, parentUrl):
        # Exit with a message and a non-zero status instead of silently.
        sys.exit("Could not load {0} after 5 attempts".format(parentUrl))

    # put the query into the search box; it is passed as a script argument so
    # quotes or backslashes in the query cannot break or alter the JavaScript.
    driver.execute_script(
        "document.getElementById('SearchKeyword').value = arguments[0]",
        searchQuery,
    )

    # Wait up to 600 seconds for the results container to appear; the user is
    # expected to enter any remaining details and press the search button.
    WebDriverWait(driver, 600).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="mainPageContent"]/div[3]/div[3]/div/div[1]/div[2]'))
    )
    print("Moved to search results page")

    adUrls = _collect_ad_urls(driver, parentUrl, resultsWanted)
    print("Collected all ad URLs")
    print(len(adUrls))

    # Remember the search window so we can return to it after every ad.
    mainWindow = driver.current_window_handle
    table = []

    for payloadUrl in adUrls:
        print(payloadUrl)
        if _open_ad_in_new_tab(driver, payloadUrl):
            table.append(_scrape_ad(driver))
            if len(table) >= _ROWS_PER_FLUSH:
                writeToFile(table)
                table = []
        driver.switch_to.window(mainWindow)

    # Write whatever is still buffered; without this the final rows of each
    # query (fewer than a full batch) would never reach output.csv.
    if table:
        writeToFile(table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment