Last active
October 18, 2019 20:08
-
-
Save rounakdatta/d46ca394b549cce68c6b1205293c5762 to your computer and use it in GitHub Desktop.
Pick up names, phone numbers, and emails from Kijiji ads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs
import csv
import os
import re
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def writeToFile(table):
    """Append the rows of *table* to ``output.csv`` in the working directory.

    Args:
        table: An iterable of rows, each row an iterable of cell values, as
            accepted by ``csv.writer.writerows``.

    The file is opened in append mode, so repeated calls accumulate rows.
    """
    # newline='' is what the csv module expects; without it every row is
    # followed by a blank line on Windows.  The encoding is pinned to UTF-8 so
    # non-ASCII text does not depend on the platform default.  Plain 'utf-8'
    # is used rather than 'utf-8-sig' because the latter would emit a BOM on
    # every append, not just at the start of the file.
    with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(table)
# ---------------------------------------------------------------------------
# Script body.
#
# Usage: <script> <resultsWanted> <searchQuery> [<searchQuery> ...]
#
# Logs in to Kijiji, and for every search query collects ad URLs from the
# result pages, opens each ad in its own tab, types (but does not send) a
# message, and appends [business name, phone, person name, email] rows to
# output.csv via writeToFile().
# ---------------------------------------------------------------------------

parentUrl = "https://www.kijiji.ca"

MAX_RETRIES = 5   # attempts for every page load
BATCH_SIZE = 10   # rows buffered before they are appended to output.csv

# Compiled once instead of once per ad.  Group 1 spans the whole match.
PHONE_NUMBER_REGEX = re.compile(r'((\()?\d\d\d(\)?)(-| )?(\d\d\d(-| )?\d\d\d\d))')
EMAIL_REGEX = re.compile(r'\S+@\S+')
PAGE_SEGMENT_REGEX = re.compile(r'page-\d+')


def _getWithRetry(url, attempts=MAX_RETRIES):
    """Load *url* in the current window; return True on the first success.

    Returns False when all *attempts* raised.
    """
    for _ in range(attempts):
        try:
            driver.get(url)
            return True
        # Deliberately broad: any failure of the load is treated as
        # retryable.  Unlike a bare ``except:`` this lets Ctrl-C through.
        except Exception as e:
            print(e)
    return False


def _openInNewTab(url, attempts=MAX_RETRIES):
    """Open one new tab, switch to it and load *url*, retrying on failure.

    The tab is created at most once, however many load attempts are needed,
    and is located by comparing the window handles before and after
    ``window.open()`` rather than by a positional index.

    Returns True when the page loaded, False when every attempt failed.
    """
    tabOpened = False
    for _ in range(attempts):
        try:
            if not tabOpened:
                knownHandles = set(driver.window_handles)
                driver.execute_script('window.open()')
                newHandles = [h for h in driver.window_handles
                              if h not in knownHandles]
                driver.switch_to.window(newHandles[-1])
                tabOpened = True
            driver.get(url)
            return True
        except Exception as e:
            print(e)
    return False


def _collectAdUrls(wanted):
    """Walk the search result pages and return the ad URLs found, de-duplicated.

    Starts from the result page currently loaded in the driver and stops when
    at least *wanted* URLs have been collected or the end-of-pagination check
    fires.
    """
    collected = []
    seen = set()
    pageIndex = 1
    while True:
        currentPage = driver.current_url
        currentPageList = currentPage.split("/")

        # End of pagination: the browser sits on page (pageIndex - 1) although
        # page pageIndex was requested -- presumably the site redirects
        # requests past the last page back to it (TODO confirm).
        checkerPageList = (currentPageList[:-2]
                           + ["page-" + str(pageIndex - 1)]
                           + currentPageList[-1:])
        if '/'.join(checkerPageList) == currentPage:
            break

        # Ads are read from both <table> and div.search-item elements; only
        # elements carrying a data-vip-url attribute count.
        candidates = (driver.find_elements(By.TAG_NAME, "table")
                      + driver.find_elements(By.CSS_SELECTOR, 'div.search-item'))
        for result in candidates:
            adUrl = result.get_attribute("data-vip-url")
            if adUrl is not None and adUrl not in seen:
                seen.add(adUrl)
                collected.append(parentUrl + adUrl)

        # Move to the next page.  An existing "page-N" segment is replaced
        # rather than kept, so the URL never becomes ".../page-2/page-3/...".
        pageIndex += 1
        baseParts = currentPageList[:-1]
        if baseParts and PAGE_SEGMENT_REGEX.fullmatch(baseParts[-1]):
            baseParts = baseParts[:-1]
        newPage = '/'.join(baseParts + ["page-" + str(pageIndex)]
                           + currentPageList[-1:])
        if not _getWithRetry(newPage):
            # NOTE(review): kept from the original -- together with the
            # increment above this skips a page index after a failed load;
            # confirm that is intended.
            pageIndex += 1
            continue

        print("Collected {0} URLs".format(len(collected)))
        if len(collected) >= wanted:
            break
    return collected


def _scrapeCurrentAd():
    """Return [businessName, phoneNumber, personName, personEmail] for the ad
    loaded in the current window.  Fields that cannot be found are ""."""
    # Paragraphs are joined with newlines so a number or address at the end
    # of one paragraph cannot fuse with the start of the next.
    allText = "\n".join(p.text for p in driver.find_elements(By.TAG_NAME, "p"))

    # pick the business name from the ad
    try:
        businessName = driver.find_element(
            By.XPATH, '//*[@id="ViewItemPage"]/div[5]/div[1]/div[1]/div/h1').text
    except WebDriverException as e:
        # A missing heading must not abort the whole run.
        print(e)
        businessName = ""
    print(businessName)
    allText += ("\n" + businessName)

    # Pre-fill the reply box.  The message is only typed, never submitted.
    try:
        messageBox = driver.find_element(By.ID, 'message')
        messageBox.send_keys(Keys.CONTROL, 'a')
        messageBox.send_keys('Hey there!')
    except WebDriverException as e:
        print(e)

    # pick the phone number from the ad (first match in the text)
    phoneMatch = PHONE_NUMBER_REGEX.search(allText)
    phoneNumber = phoneMatch.group(1) if phoneMatch else ""
    print(phoneNumber)

    # pick the person name from the ad
    try:
        personName = driver.find_element(
            By.XPATH, '//*[@id="vip-body"]/div[6]/div[3]/div/div[1]/div/a').text
    except WebDriverException:
        personName = ""
    print(personName)

    # pick up the email from the ad: the longest candidate wins, the last one
    # on ties (sorted() is stable)
    emailsFound = EMAIL_REGEX.findall(allText)
    personEmail = sorted(emailsFound, key=len)[-1] if emailsFound else ""
    print(personEmail)

    return [businessName, phoneNumber, personName, personEmail]


print(sys.argv)

# Validate the command line before a browser is launched.
if len(sys.argv) < 2:
    sys.exit("usage: {0} <resultsWanted> <searchQuery> [<searchQuery> ...]"
             .format(sys.argv[0]))
try:
    resultsWanted = int(sys.argv[1])
except ValueError:
    sys.exit("resultsWanted must be an integer, got {0!r}".format(sys.argv[1]))

driver = webdriver.Chrome()

# logging in to Kijiji
if not _getWithRetry(parentUrl + '/t-login.html'):
    driver.quit()
    sys.exit("Could not load the Kijiji login page")

emailBox = driver.find_element(By.ID, 'LoginEmailOrNickname')
passwordBox = driver.find_element(By.ID, 'login-password')
loginButton = driver.find_element(By.ID, 'SignInButton')

# SECURITY: credentials should not live in source control.  Set KIJIJI_EMAIL
# and KIJIJI_PASSWORD in the environment to override the built-in fallbacks.
emailBox.send_keys(os.environ.get('KIJIJI_EMAIL', '[email protected]'))
passwordBox.send_keys(os.environ.get('KIJIJI_PASSWORD', 'cabot#123'))
loginButton.click()

# Every ad is opened in a new tab; this is the window to come back to.
mainWindow = driver.current_window_handle

print("Getting approximately {0} results for each query".format(resultsWanted))

for searchQuery in sys.argv[2:]:
    if not _getWithRetry(parentUrl):
        # The driver is not quit here: tabs opened for earlier queries still
        # hold the typed-in messages.
        sys.exit("Could not load {0}".format(parentUrl))

    # Put the query into the search box.  It is passed as a script argument
    # instead of being spliced into the JavaScript source, so quotes or
    # backslashes in the query cannot break or alter the script.
    driver.execute_script(
        "document.getElementById('SearchKeyword').value = arguments[0]",
        searchQuery)

    # wait (up to 10 minutes) for the user to enter the remaining details and
    # press the search button
    WebDriverWait(driver, 600).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="mainPageContent"]/div[3]/div[3]/div/div[1]/div[2]'))
    )
    print("Moved to search results page")

    adUrls = _collectAdUrls(resultsWanted)
    print("Collected all ad URLs")
    print(len(adUrls))

    table = []
    try:
        for payloadUrl in adUrls:
            print(payloadUrl)
            if not _openInNewTab(payloadUrl):
                driver.switch_to.window(mainWindow)
                continue

            table.append(_scrapeCurrentAd())
            if len(table) >= BATCH_SIZE:
                writeToFile(table)
                table = []

            driver.switch_to.window(mainWindow)
    finally:
        # Flush the remainder -- also when the loop is interrupted -- so rows
        # already scraped are never lost.
        if table:
            writeToFile(table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment