Skip to content

Instantly share code, notes, and snippets.

View thisismattmiller's full-sized avatar
😑
...

Matt Miller thisismattmiller

😑
...
View GitHub Profile
import requests
import json
source_data = json.load(open('data.json'))
url = 'https://maps.googleapis.com/maps/api/geocode/json'
import requests
import shutil
import camelot.io as camelot
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for year in range(2011,2017):
url = f'https://files.dep.state.pa.us/Waste/Recycling/RecyclingPortalFiles/Documents/{year}_Recycling_Report.pdf'
@thisismattmiller
thisismattmiller / extract.py
Created March 30, 2022 18:31
Code for https://youtu.be/pwnIcJ9p2C4 – Web scraping with Selenium
import glob
from bs4 import BeautifulSoup
import json
all_files = list(glob.glob('html/*.html'))
all_data = []
for file_name in all_files:
with open(file_name) as infile:
import glob
import json
urls = {}
for file in glob.glob('data_sogb/*'):
with open(file) as inf:
import waybackpy
urls = [
"http://dmc.signourguestbook.com/?username=dmc&trail=25",
"http://dmc.signourguestbook.com/?username=dmc&trail=50",
"http://dmc.signourguestbook.com/?username=dmc&trail=75",
"http://dmc.signourguestbook.com/?username=dmc&trail=100",
"http://dmc.signourguestbook.com/?username=dmc&trail=125",
"http://dmc.signourguestbook.com/?username=dmc&trail=150",
"http://dmc.signourguestbook.com/?username=dmc&trail=175",
{
"id": "lc:RT:bf2:MIBluRayDVD:Instance",
"propertyTemplates": [
{
"mandatory": "false",
"propertyLabel": "Instance Of",
"propertyURI": "http://id.loc.gov/ontologies/bibframe/instanceOf",
"repeatable": "false",
"resourceTemplates": [],
"type": "resource",
{
"russcarnahan.com": 16,
"secure.actblue.com": 13,
"secure.piryx.com": 6,
"services.myngp.com": 5,
"rickperry.org": 5,
"secure.mydccc.org": 5,
"markleyva.com": 4,
"clyburnforcongress.com": 4,
"johnsprattforcongress.com": 4,
<div><strong>imdb_id1</strong>: <a href="/movie/<%=imdb_id1%>"><%=imdb_id1%></a></div>
<div><strong>color1</strong>: <%=color1%></div>
<div><strong>director_name1</strong>: <%=director_name1%></div>
<div><strong>num_critic_for_reviews1</strong>: <%=num_critic_for_reviews1%></div>
<div><strong>duration1</strong>: <%=duration1%></div>
<div><strong>director_facebook_likes1</strong>: <%=director_facebook_likes1%></div>
<div><strong>actor_3_facebook_likes1</strong>: <%=actor_3_facebook_likes1%></div>
<div><strong>actor_2_name1</strong>: <%=actor_2_name1%></div>
<div><strong>actor_1_facebook_likes1</strong>: <%=actor_1_facebook_likes1%></div>
<div><strong>gross1</strong>: <%=gross1%></div>
@thisismattmiller
thisismattmiller / ch_sparql.py
Last active August 6, 2021 14:34
Example using Python to interact with Carnegie Hall's SPARQL endpoint
import requests
import json
url = "http://data.carnegiehall.org/sparql/"
sparql = """
#Find works by string in the title (case-insensitive)
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
@thisismattmiller
thisismattmiller / get_links.py
Created April 26, 2021 14:50
Downloading PBCore records from https://americanarchive.org