Created June 26, 2015 at 16:18
-
-
Save abhigenie92/7505af94b045d729d62e to your computer and use it in GitHub Desktop.
Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from bs4 import BeautifulSoup | |
import urllib,requests,unidecode,lxml | |
class wait_for_more_than_n_elements_to_be_present(object):
    """Custom Selenium expected condition.

    Succeeds once the number of elements matching ``locator`` exceeds
    ``count``. Intended for infinite-scroll pages: wait until *more*
    items have loaded than were present before scrolling.
    """

    def __init__(self, locator, count):
        # locator: (By.<strategy>, selector) tuple; count: element count to exceed
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Use the public find_elements API instead of the private
            # EC._find_elements helper, which is not a stable interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            # Page re-rendered mid-query; report "not yet" so the wait retries.
            return False
def return_html_code(url):
    """Open *url* in Firefox, scroll until no more tweets load, and
    return the fully rendered page source as a string.

    Blocks until the page stops producing new "li[data-item-id]" items
    (each wait times out after 10 seconds). The browser session is always
    shut down, even if a wait or script call raises.
    """
    driver = webdriver.Firefox()
    try:
        driver.maximize_window()
        driver.get(url)
        tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")
        # Initial wait for the first batch of tweets to render.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located(tweet_locator))
        # Scroll the last tweet into view until no new tweets are loaded.
        while True:
            tweets = driver.find_elements(*tweet_locator)
            number_of_tweets = len(tweets)
            print(number_of_tweets)
            driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    tweet_locator, number_of_tweets))
            except TimeoutException:
                # No additional tweets appeared within the timeout: done.
                break
        return driver.page_source
    finally:
        # quit() (not close()) ends the whole WebDriver session, so the
        # browser process is not leaked when an exception propagates.
        driver.quit()
url = 'https://twitter.com/thecoolstacks'
# Render the page with a real browser (Selenium) so JS-loaded tweets appear.
html_source = return_html_code(url)
soup = BeautifulSoup(html_source, "lxml")
for tweet in soup.select("div.tweet div.content"):
    # Guard against content divs with no <p> child (would raise AttributeError).
    if tweet.p is not None:
        print(tweet.p.text)
#using request modules | |
# if False: | |
# req = requests.get(url) | |
# soup = BeautifulSoup(req.content) | |
# text_tweet=[] | |
# alltweets = soup.find_all(attrs={'data-item-type' : 'tweet'}) | |
# for tweet in alltweets: | |
# #Text of tweet | |
# html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text") | |
# text_tweet.append(''.join(html_tweet[0].findAll(text=True))) | |
# print text_tweet | |
#finalcode | |
# alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'}) | |
# for tweet in alltweets_selenium: | |
# #Text of tweet | |
# html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text") | |
# text_tweet.append(''.join(html_tweet[0].findAll(text=True))) | |
# print text_tweet |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.