Generate BibTeX from the list of tweets you (or another user) have favorited ('liked'), tweeted, or retweeted
#! /usr/bin/env python3
"""
faves2bibtex.py
Author: Scott Hawley
Scrapes URLs contained in tweets you (or another user) favorited ('liked') for DOI & other bibliographic info,
and tries to generate a set of BibTeX entries to stdout.
Status messages go to stderr.
Sample usage:
    ./faves2bibtex.py drscotthawley | tee refs.bib
Status:
- It will generate BibTeX if it finds a DOI immediately, or if the reference is on arXiv.org but too new to have a DOI
- Otherwise, it tries to cobble together some kind of @misc entry by searching for 'common' meta tags, but often fails :'(
- Added a book ISBN-to-BibTeX functionality, but it's currently not being used. TODO: scrape for ISBNs.
Tested on: Mac OS X 10.12.6, Python 3.5 (anaconda)
Aside: this project has been eye-opening regarding the number of ways that, even if your HTTP request succeeds, various library routines may crash your code.
"""
import tweepy
import sys
import re
import requests
#import urllib3
import http.client as client
import urllib.request   # 'import urllib' alone doesn't expose the request/error/parse submodules in Python 3
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import os
import time
# You need to supply your own Twitter API developer keys here
# Some instructions here: https://www.digitalocean.com/community/tutorials/how-to-authenticate-a-python-application-with-twitter-using-tweepy-on-ubuntu-14-04
consumer_key = '****'
consumer_secret = '****'
access_token = '****'
access_token_secret = '****'
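# (Aside, a sketch rather than part of the original workflow: instead of hard-coding the keys above,
# they could be pulled from environment variables so they never end up in version control.
# The variable names below are illustrative, not an established convention:
#   consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', consumer_key)
#   consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', consumer_secret)
#   access_token = os.environ.get('TWITTER_ACCESS_TOKEN', access_token)
#   access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', access_token_secret)
# )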
def eprint(*args, **kwargs): # print to stderr
    print(*args, file=sys.stderr, **kwargs)
def doi2bib(doi):
    """
    Return a BibTeX string of metadata for a given DOI.
    Based on https://gist.github.com/jrsmith3/5513926
    """
    if ("" == doi):
        return ""
    bibtext = ''
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    r = requests.get(url, headers = headers)
    if ('This DOI cannot be found in the DOI System' not in r.text):
        bibtext = r.text
    else:
        eprint("Warning: Attempt to convert DOI",doi,"failed.")
    return bibtext
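# Illustrative usage of doi2bib (the DOI below is a made-up placeholder, not a real reference):
#   bibtex_entry = doi2bib("10.1234/example.5678")   # returns a BibTeX string, or '' if the lookup fails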
def slurp_url(url): # just read in an entire webpage and return the source as text
    # from https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
    html = ''
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent,}
    request = urllib.request.Request(url, None, headers)
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e: # e.g. 404
        eprint(" slurp_url: http error ", e)
        return ''
    raw = response.read()  # read the body once; a second read() would return empty bytes
    try: # to decode
        html = raw.decode("utf-8") # sometimes this gives you an 'invalid continuation byte' error
    except UnicodeDecodeError:
        html = raw.decode("latin-1") # ...in which case latin-1 always succeeds
    return html
'''
def slurp_url_old(url):
    html = ''
    try: # requests is temperamental https://github.com/requests/requests/issues/3840
        html = requests.get(url).text
    except requests.exceptions.ContentDecodingError: # huffington post generates these regularly: malformed gzipped encoding
        eprint(" ContentDecodingError for url = ", url)
        eprint(" Skipping this url. ") # tired of this %&$$%$#
    return html
'''
def expand_url(url):
    # requests is nice in that it follows multiple links, but can crash too
    actual_url = ''
    try: # requests is temperamental https://github.com/requests/requests/issues/3840
        r = requests.get(url)
        actual_url = r.url
    except requests.exceptions.ContentDecodingError: # huffington post generates these regularly
        # below code from https://stackoverflow.com/questions/4201062/how-can-i-unshorten-a-url
        # this 'old school' approach won't follow chained redirects, but is otherwise robust
        parsed = urllib.parse.urlparse(url)
        h = client.HTTPConnection(parsed.netloc)
        h.request('HEAD', parsed.path)
        response = h.getresponse()
        if response.status//100 == 3 and response.getheader('Location'):
            actual_url = response.getheader('Location')
    return actual_url
'''
def expand_url_old(url): # Twitter's t.co link shortening needs expanding
    return old_skool(url)
    actual_url = ''
    query_url = 'https://unshorten.me/s/'+url # unshorten.me is great but only allows 10 new evals per hour!
    actual_url = requests.get(query_url).text
    eprint(" expand_url: actual_url =", actual_url)
    if actual_url.find("Usage") != -1:
    #if ('Usage' in actual_url):
        eprint(" Hey")
        actual_url = ''
        try: # requests is temperamental https://github.com/requests/requests/issues/3840
            r = requests.get(url)
            eprint(" r = ", r)
            actual_url = r.url
        except requests.exceptions.ContentDecodingError: # huffington post generates these regularly
            eprint(" ContentDecodingError for url = ", url)
            eprint(" Trying urllib instead: ")
            parsed = urllib.parse.urlparse(url)
            h = client.HTTPConnection(parsed.netloc)
            h.request('HEAD', parsed.path)
            response = h.getresponse()
            if response.status//100 == 3 and response.getheader('Location'):
                return response.getheader('Location')
            else:
                return url
    else:
        eprint(" Nope")
    return actual_url
'''
def extract_doi(url, html): # searches webpage text for the first string matching the DOI format
    doi = "" # blank DOI string doubles as error/fail message
    if ('doi' in url): # a couple easy special cases
        url = url.replace('http://','')
        url = url.replace('https://','')
        doi = url.replace('aapt.scitation.org/doi/','')
        doi = doi.replace('doi.org/','')
    else:
        doi_re = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b") # doi pattern regexp
        matchObj = doi_re.search(html)
        if matchObj:
            doi = matchObj.group(0) # grab the first thing in the page that fits the doi format
    return doi
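# For reference: the regexp above matches Crossref-style DOIs of the form "10.NNNN/suffix";
# e.g. the made-up string "10.1234/abcd.5678" would match. Only the first match in the page source is used.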
def arxiv_without_doi(url, html): # if the arxiv entry is so new that it doesn't contain a DOI
    bibtext = '' # blank bibtext serves as an initialization and error code
    # first a check: is this a pdf arxiv link? If so, can we get a DOI if we search the 'abs' url?
    if ('pdf' in url):
        new_url = url.replace('.pdf','')
        new_url = new_url.replace('pdf','abs')
        new_html = slurp_url(new_url)
        doi = extract_doi(new_url, new_html)
        if ("" != doi): # if the webpage contains a DOI, we're finished
            eprint(" Found DOI = ", doi)
            bibtext = doi2bib(doi)
    if ('' == bibtext):
        # bibsonomy.org does a GREAT job of formatting, but sets a limit on how frequently it can be accessed
        query_url = 'http://scraper.bibsonomy.org/service?format=bibtex&selection=&url='+url
        eprint(" query_url = ", query_url)
        attempts, maxattempts = 0, 10
        while ((attempts < maxattempts) and ('' == bibtext)):
            attempts += 1
            r = requests.get(query_url)
            if ('You sent too many requests' in r.text):
                eprint(" Bibsonomy says we're using it too much. (Attempt",attempts,"of",maxattempts,").",end="")
                if (attempts < maxattempts):
                    nsecs = 60
                    eprint(" Waiting",nsecs,"seconds before trying again...")
                    time.sleep(nsecs)
            else: # success!
                bibtext = r.text
                eprint("")
    if ('' == bibtext): # Try a different method
        # arxiv2bibtex.org: no frequency limits, but the output isn't formatted as nicely IMHO
        arxiv_val = url.replace('https://arxiv.org/abs/','') # get only the arxiv index number
        query_url = 'https://arxiv2bibtex.org/?q='+ arxiv_val +'&format=bibtex'
        r = requests.get(query_url)
        soup = BeautifulSoup(r.text, "html.parser")
        textarea = soup.find('textarea') # the first textarea from arxiv2bibtex is the BibTeX output
        if (textarea):
            bibtext = textarea.getText()
    return bibtext
def generic_web_page(url, html):
    # For now, we're going to largely rely on common meta tags, e.g. facebook
    # So far, if it can't find an author, then it doesn't produce anything.
    # TODO: This is horrible and I will gladly replace this
    bibtext = ''
    if ('https://twitter.com/' in url): # url is un-shortened of course
        eprint(" generic_web_page: skipping 'mere tweet'")
        return '' # have yet to find any bibtex-able info in a mere tweet
    soup = BeautifulSoup(html, "html.parser")
    author = soup.find("meta", attrs={"name": "author"}) # look for <meta name="author" content="...">
    if not author:
        author = soup.find(property="og:author")
    eprint(" generic_web_page: author =", author)
    if (author):
        author = author.get("content")
        bibtext += '@misc{'+author+',\n'
        bibtext += ' Author = {'+author+'},\n'
    else:
        eprint(" skipping.")
    title = soup.find(property="og:title")
    if (author and title):
        title = title.get("content")
        bibtext += ' Title = {'+title+'},\n'
    date = soup.find(itemprop="datePublished")
    if (date):
        date = date.get("content")
    website_name = soup.find(property="og:site_name")
    if (website_name):
        website_name = website_name.get("content")
    if ('' != bibtext):
        bibtext += ' URL = {'+url+'},\n'
        last_access = time.strftime("%b %d %Y")
        bibtext += ' Note = {Last accessed '+last_access+'},\n'
        bibtext += '}'
    return bibtext
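# When it succeeds, generic_web_page() yields an entry of roughly this shape (the values here are
# illustrative placeholders; note the citation key is simply whatever author string was found):
#   @misc{Jane Doe,
#    Author = {Jane Doe},
#    Title = {Some Article Title},
#    URL = {https://example.com/article},
#    Note = {Last accessed Jan 01 2024},
#   }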
def scrape_for_isbn(actual_url, html):
    isbn = None
    # TODO: put something here!
    return isbn
def isbn_to_bibtex(isbn):
    # source: borrows from https://gist.github.com/wcaleb/5178632
    bibtext = ''
    query_url = 'http://www.ottobib.com/isbn/'+isbn+'/bibtex'
    html = slurp_url(query_url)
    # Use BS4 to get the formatted citation returned by OttoBib
    soup = BeautifulSoup(html, "html.parser")
    for br in soup.find_all(name='br'):
        br.decompose()
    result = soup.find("div", class_="nine columns")
    if (result):
        bibtext = result.text
    return bibtext
def limit_handled(cursor): # limits API calls so Twitter won't block the bot
    while True:
        try:
            yield cursor.next()
        except tweepy.RateLimitError:
            mins = 15
            eprint(' Hit the Twitter API rate limit. Waiting',mins,'minutes')
            time.sleep(mins * 60) # wait 15 minutes before trying again
def tweet_to_bibtex(tweet, bibcount):
    bibtext = ""
    # get list of urls
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.full_text)
    for url in urls: # all the urls in the Tweet (usually just one) get followed
        bibtext = ""
        eprint(" Trying url = ", url)
        actual_url = expand_url(url)
        eprint(" actual_url = ", actual_url)
        if ('' != actual_url):
            html = slurp_url(actual_url) # full text of web page
            doi = extract_doi(actual_url, html)
            if ("" != doi): # if the webpage contains a DOI, we're finished
                eprint(" Found DOI = ", doi)
                bibtext = doi2bib(doi)
            elif ("arxiv.org" in actual_url): # if the url is for an arxiv post (which doesn't contain a DOI)
                bibtext = arxiv_without_doi(actual_url, html)
            elif ('ISBN' in html): # somewhere in the linked page may be a book ISBN id
                isbn = scrape_for_isbn(actual_url, html)
                if (isbn):
                    bibtext = isbn_to_bibtex(isbn)
            else: # let's try to generate an entry for the linked webpage itself
                bibtext = generic_web_page(actual_url, html)
            if ("" != bibtext):
                bibcount += 1
                print(bibtext,'\n',flush=True)
    return bibtext, bibcount
def scrape_faves(user_id):
    """
    This is the main routine.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    favecount, bibcount = 0, 0
    for tweet in limit_handled( tweepy.Cursor(api.favorites, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
        #for tweet in api.favorites(screen_name = user_id, include_entities = True, tweet_mode='extended'): # only does 20 at a time
        favecount += 1
        bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
        eprint("-----") # just a divider between tweets
    # Things the user has tweeted or re-tweeted are as bib-worthy as things they've faved
    for tweet in limit_handled( tweepy.Cursor(api.user_timeline, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
        favecount += 1
        bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
        eprint("-----") # just a divider between tweets
    eprint(favecount,"favorites, tweets & RTs scraped")
    eprint(bibcount,"BibTeX entries generated.")
if __name__ == '__main__':
    if (False): # quick-testing block
        eprint(isbn_to_bibtex('0754666913')) # testing for now; brent waters' book
        url = 'https://t.co/Wf9U9fuPoI' # problem url from huffpo
        url = 'https://fb.me/1sfK1HGSE' # problem url from fb
        eprint(" trying url = ", url)
        actual_url = expand_url(url)
        eprint(" actual url = ", actual_url)
        html = slurp_url(actual_url)
        eprint(" html = ", html)
    if len(sys.argv) == 2:
        user_id = sys.argv[1]
        scrape_faves(user_id)
    else:
        eprint("Usage: ",sys.argv[0]," <user_id>",sep="")