Skip to content

Instantly share code, notes, and snippets.

View thisismattmiller's full-sized avatar
😑
...

Matt Miller thisismattmiller

😑
...
View GitHub Profile
{
"load_composer_autoloader": true,
"name": "WikibaseQualityConstraints",
"author": [
"BP2014N1",
"Lucas Werkmeister"
],
"url": "https://www.mediawiki.org/wiki/Extension:WikibaseQualityConstraints",
"descriptionmsg": "wbqc-desc",
"version": "1.0.0",
{
"load_composer_autoloader": true,
"name": "WikibaseQualityConstraints",
"author": [
"BP2014N1",
"Lucas Werkmeister"
],
"url": "https://www.mediawiki.org/wiki/Extension:WikibaseQualityConstraints",
"descriptionmsg": "wbqc-desc",
"version": "1.0.0",
<!DOCTYPE html>
<html lang="en">
<head>
<!--
This is an HTML comment
You can write text in a comment and the content won't be visible in the page
-->
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:bf="http://id.loc.gov/ontologies/bibframe/" xmlns:bflc="http://id.loc.gov/ontologies/bflc/" xmlns:lclocal="http://id.loc.gov/ontologies/lclocal/" xmlns:madsrdf="http://www.loc.gov/mads/rdf/v1#" xmlns:pmo="http://performedmusicontology.org/ontology/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:streams="info:lc/streams#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<bf:Instance rdf:about="http://id.loc.gov/resources/instances/20898769">
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Print"/>
<bf:issuance>
<bf:Issuance rdf:about="http://id.loc.gov/vocabulary/issuance/mono"/>
</bf:issuance>
<bf:provisionActivity>
<bf:ProvisionActivity>
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Publication"/>
import requests
import json
source_data = json.load(open('data.json'))
url = 'https://maps.googleapis.com/maps/api/geocode/json'
import requests
import shutil
import camelot.io as camelot
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for year in range(2011,2017):
url = f'https://files.dep.state.pa.us/Waste/Recycling/RecyclingPortalFiles/Documents/{year}_Recycling_Report.pdf'
@thisismattmiller
thisismattmiller / extract.py
Created March 30, 2022 18:31
Code for https://youtu.be/pwnIcJ9p2C4 Web scraping with selenium
import glob
from bs4 import BeautifulSoup
import json
all_files = list(glob.glob('html/*.html'))
all_data = []
for file_name in all_files:
with open(file_name) as infile:
import glob
import json
urls = {}
for file in glob.glob('data_sogb/*'):
with open(file) as inf:
import waybackpy
urls = [
"http://dmc.signourguestbook.com/?username=dmc&trail=25",
"http://dmc.signourguestbook.com/?username=dmc&trail=50",
"http://dmc.signourguestbook.com/?username=dmc&trail=75",
"http://dmc.signourguestbook.com/?username=dmc&trail=100",
"http://dmc.signourguestbook.com/?username=dmc&trail=125",
"http://dmc.signourguestbook.com/?username=dmc&trail=150",
"http://dmc.signourguestbook.com/?username=dmc&trail=175",
{
"id": "lc:RT:bf2:MIBluRayDVD:Instance",
"propertyTemplates": [
{
"mandatory": "false",
"propertyLabel": "Instance Of",
"propertyURI": "http://id.loc.gov/ontologies/bibframe/instanceOf",
"repeatable": "false",
"resourceTemplates": [],
"type": "resource",