This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install/attach the required packages, then collect recent tweets for a
# search term and look up the accounts that posted them.
# NOTE(review): searchTwitter() and lookupUsers() presumably require an
# authenticated twitteR session set up beforehand -- confirm.

# Set to FALSE once the packages are installed to skip the download step.
doInstall <- TRUE
toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
if (doInstall) {
  # Use the HTTPS mirror so packages are not fetched over plain HTTP.
  install.packages(toInstall, repos = "https://cran.us.r-project.org")
}
# invisible() keeps the list returned by lapply() out of the console.
invisible(lapply(toInstall, library, character.only = TRUE))

searchTerm <- "#rstats"
searchResults <- searchTwitter(searchTerm, n = 1000)  # Gather tweets
tweetFrame <- twListToDF(searchResults)               # Convert to a data frame
userInfo <- lookupUsers(tweetFrame$screenName)        # Batch lookup of user info
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Command line: install the tooling and create the Scrapy project skeleton.
pip install scrapy
pip install selenium
# cd your/desired/project/path/
scrapy startproject airbnb  # will create a project "airbnb" in the folder you are in
cd airbnb/                  # need to cd to a folder with a scrapy.cfg file
# this is the test address to kick off the project
# homes.py created in airbnb/airbnb/spiders/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ipython
import scrapy
import selenium
from scrapy.selector import Selector
from selenium import webdriver

# Start a Chrome session; replace the placeholder with the real chromedriver path.
driver = webdriver.Chrome('path/to/the/chromedriver')
# Load the Airbnb "homes" search results for Dubai in the browser.
driver.get('https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7')
# Wrap the page source currently held by the browser in a Scrapy selector
# so it can be queried with XPath.
scrapy_selector = Selector(text=driver.page_source)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
python3.6 | |
part of the Scrapy architecture | |
goes through the "Homes" page of airbnb.ae and gets property name, price per night, rating per property | |
""" | |
from time import sleep | |
import scrapy, selenium | |
from selenium import webdriver | |
from scrapy import Spider |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
python3.6 | |
downloads images from URLs stored in a csv file according to labels | |
and saves them to a specified directory / subdirectory | |
e.g. for sorting images into folders by labels for image classification | |
""" | |
import pandas as pd | |
import urllib | |
import os |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Command line | |
scrapy shell 'https://www.airbnb.ae/s/Dubai/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&guests=0&place_id=ChIJRcbZaklDXz4RYlEphFBu5r0&query=Dubai&allow_override%5B%5D=&s_tag=CKxLe9y7' | |
# testing selector for the property name | |
response.xpath('//*[@class="_ng4pvpo"]') | |
# Out[1]: [] | |
response.xpath('//*[@itemtype="http://schema.org/ListItem"]') | |
# Out[2]: [] | |
response.xpath('//*[@itemprop="name"]') | |
# Out[3]: [] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
python3.6 | |
ipython console test | |
""" | |
from scrapy.selector import Selector | |
scrapy_selector = Selector(text = self.driver.page_source) | |
homes_selector = scrapy_selector.xpath('//*[@itemtype="http://schema.org/ListItem"]') #name of an item can be changed by Airbnb | |
try: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
python3.6 | |
""" | |
from time import sleep | |
import scrapy | |
import selenium | |
from scrapy import Spider | |
from scrapy.selector import Selector |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
python3.6 | |
Scrapy + Selenium | |
""" | |
for profile_url in profile_urls_distinct: | |
self.logger.info('Home #' + str(q)) | |
self.driver.get(profile_url) | |
q = q+1 | |
sleep(10) | |
link_to_home = profile_url |
Older | Newer