Last active
May 31, 2017 01:41
-
-
Save yarko/1edf7a6ff8ac524ec98928158a86012e to your computer and use it in GitHub Desktop.
debugging scraping threads from twitter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Debugging session: fetch one tweet page with requests, dump the raw HTML,
# then parse it and look for the reply text that is expected in the thread.
import requests
# 2017-05-30 17:02:10.049243
url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
r = requests.get(url)
# Save the text, see if response is there or not manually:
# 2017-05-30 17:04:07.293014
# Explicit utf-8 so non-ASCII characters in the page cannot fail on a
# platform whose default encoding is narrower.
with open('bug_reqraw.txt', 'w', encoding='utf-8') as f:
    f.write(r.text)
# Or parse, and see if response is in parsed tree
# 2017-05-30 17:09:04.052462
from bs4 import BeautifulSoup
# 2017-05-30 17:10:33.255331
soup = BeautifulSoup(r.text, "html.parser")
# 2017-05-30 17:11:09.909756
import re
# 2017-05-30 17:11:43.421237
t = soup.find(string=re.compile("Reading"))
# 2017-05-30 17:11:48.822705
# find() returns None when nothing matches, so guard before calling len();
# print the result so it is visible when run as a script, not only in a REPL.
if t is None:
    print('no string matching "Reading" found in the parsed page')
else:
    print(len(t))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# the simplest approach (and it fails too): download the page straight
# to a local file with the standard library.
from urllib.request import urlretrieve

url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
local_filename, headers = urlretrieve(url, 'bug-urllib.txt')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import vcr | |
import requests | |
from hashlib import sha1 | |
from bs4 import BeautifulSoup | |
from textwrap import wrap | |
# by default, this is for grabbing twitter threads, | |
# which are otherwise impossible to print! | |
def grab(url, update=False, dev=None):
    '''
    grab a url and save the HTTP exchange locally w/ vcr.

    The cassette lives in the 'vcr_cassettes' directory and is named
    after the SHA-1 hex digest of the url, so every url gets its own
    cassette file.

    url    - the address to fetch.
    update - if True, use vcrpy record_mode "all" to update the
             cassette; otherwise no record_mode is passed and vcrpy's
             default applies.
    dev    - develop: if given, save an uncompressed text copy of the
             results, preceded by the url, to the text file named in
             this argument (each line wrapped to 70 columns).

    Returns the response body as text.
    '''
    vcr_settings = {'cassette_library_dir': 'vcr_cassettes'}
    if update:
        vcr_settings['record_mode'] = 'all'
    my_vcr = vcr.VCR(**vcr_settings)

    # hashlib routines need bytes, so encode the url before hashing;
    # the hex digest is used as the yaml cassette filename.
    hd = sha1(url.encode('utf8')).hexdigest()

    with my_vcr.use_cassette(f'{hd}.yaml'):
        soup = requests.get(url).text

    # if you want to test a bit more:
    if dev:  # save this, so you can develop a script
        # Explicit utf-8: the page text may contain characters the
        # platform default encoding cannot represent.
        with open(dev, 'w', encoding='utf-8') as f:
            f.write(url + '\n\n')
            for line in soup.splitlines():
                # wrap() returns [] for a blank line, which still
                # yields a single newline here.
                f.write('\n'.join(wrap(line, width=70)) + '\n')
    return soup
if __name__ == "__main__":
    # DEV: grab the same url twice -- first with update=True (cassette
    # re-recorded), then with the default settings -- writing each
    # result to its own text dump.
    url = 'https://twitter.com/MinaMarkham/status/865606994614296576'
    for dump_name, refresh in (("grab_net.txt", True), ("grab_vcr.txt", False)):
        soup = grab(url, update=refresh, dev=dump_name)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# fetching the page with plain curl also fails:
curl --output bug-curl.txt 'https://twitter.com/MinaMarkham/status/865606994614296576'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a work in progress - there is a reply in this thread which seems to be missing from the fetched page:
The missing thread response I noticed is https://twitter.com/yarkot/status/866166065113571329,
with text beginning with "Reading all the responses, 2 things clear:"
Since this reply shows up in "show source" from a browser (e.g. Chrome), but is also missing from a simple curl fetch,
it raises the question: what is special about some thread responses (and getting them)?!?