Mindy McAdams (macloo)
macloo / headless_selenium.py
Last active April 9, 2019 23:46
Run Chrome headless with Selenium
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--headless")
# fill in your own path to installed chromedriver
driver = webdriver.Chrome(executable_path='/Users/dirname/dirname/dirname/chromedriver',
                          options=chrome_options)
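The preview stops once the driver is created; a minimal sketch of how it might continue, with a placeholder URL:
driver.get('https://example.com')  # placeholder URL
time.sleep(2)  # give the headless browser a moment to render
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.title.string)
driver.quit()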
macloo / get_wiki_extract.py
Last active April 7, 2019 14:08
Use Python Wikipedia-API to get text summary for any subject in a list of subjects
"""
Requires Wikipedia-API 0.5.1 or greater - and Python 3
https://pypi.org/project/Wikipedia-API/
"""
import wikipediaapi
w = wikipediaapi.Wikipedia('en')  # English-language Wikipedia
p = w.page('N._K._Jemisin')  # page title exactly as it appears in the article URL
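The preview shows a single page lookup; a sketch of the loop over a list of subjects that the description mentions (the list here is a placeholder):
subjects = ['N._K._Jemisin', 'Octavia_E._Butler']  # placeholder list
for subject in subjects:
    page = w.page(subject)
    if page.exists():  # skip titles that don't resolve to an article
        print(page.summary)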
macloo / get_all_agency_urls2.py
Created April 1, 2019 13:33
For Sarah April 2019 - part 2
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
driver = webdriver.Chrome('/Users/mcadams/Documents/python/scraping2019/chromedriver')
# testing the 'C' page only
driver.get('https://www.usa.gov/federal-agencies/c')
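The rest of the gist isn't shown; a plausible continuation that collects every link on the 'C' page (the wait time and the use of all a tags are assumptions):
time.sleep(3)  # let the slow page finish loading
soup = BeautifulSoup(driver.page_source, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))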
macloo / get_all_agency_urls.py
Created April 1, 2019 13:23
For Sarah April 2019
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
driver = webdriver.Chrome('/Users/mcadams/Documents/python/scraping2019/chromedriver')
driver.get('https://www.usa.gov/federal-agencies')
# pause because page is slow to load
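The preview cuts off at the pause; a sketch of a likely continuation (the sleep length and output filename are guesses):
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
with open('agency_urls.csv', 'w', newline='') as f:  # filename is an assumption
    writer = csv.writer(f)
    for link in soup.find_all('a'):
        writer.writerow([link.get_text(strip=True), link.get('href')])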
macloo / get_rep_deets.py
Created March 31, 2019 22:22
Collect deets for each FL rep from 2 pages
import requests
from bs4 import BeautifulSoup
import time
base_url = "https://www.myfloridahouse.gov"
# using a small list for testing
small_list = [
    "/Sections/Representatives/details.aspx?MemberId=4684&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4624&LegislativeTermId=88",
]  # closing bracket added; the gist preview is cut off mid-list
macloo / get_all_urls.py
Created March 31, 2019 21:31
For Madison March 2019
import requests
from bs4 import BeautifulSoup
url = "https://www.myfloridahouse.gov/Sections/Representatives/representatives.aspx"
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8'}  # closing brace added; the preview is cut off mid-dict
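With the headers assembled, the request itself would look something like this:
page = requests.get(url, headers=hdr)
soup = BeautifulSoup(page.text, 'html.parser')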
macloo / read_db.php
Last active January 24, 2023 01:14
A read-only page from a MySQL database - no JavaScript, no form handling
<?php include 'database.php'; ?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title> Read the Shoutbox DB </title>
<link rel="stylesheet" href="css/main.css">
</head>
macloo / keys.js
Created December 12, 2018 00:16
For Lorenzo P
// old code
/*
$('#bebop-key').click(function(){
resetAll();
$('#bebop-key').hide();
$('#bebop').css('background-color', '#EC7063');
$('#hide3').show();
$('#bebop').animate({height: '510px'}, 400);
$('#bebop').animate({width: '400px'}, 400);
$('#bebop-text').delay(1100).slideDown();
});
*/
// handler and comment block closed; the gist preview is cut off here
macloo / clean_transcript.py
Created May 26, 2018 21:41
Clean any .sbv transcript file from YouTube - remove blank lines and timecode
# clean any .sbv transcript file from YouTube
# preserve linebreaks
filename = input('What is the filename? (include .sbv) ')
myfile = open(filename)
mylist = myfile.readlines()
myfile.close()
length = str(len(mylist))
new_length = str(int(len(mylist) / 3))  # roughly one line in three should survive the cleanup
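The preview stops before the cleaning itself; one way that step might work, assuming each .sbv cue is three lines (timecode, text, blank):
clean_lines = [line for i, line in enumerate(mylist) if i % 3 == 1]
with open('clean_' + filename, 'w') as out:  # output filename is an assumption
    out.writelines(clean_lines)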
macloo / govtrack.py
Last active February 25, 2025 15:51
Scraping a page - requires BeautifulSoup and Requests
from bs4 import BeautifulSoup
import requests
url = 'https://www.govtrack.us/congress/members/amy_klobuchar/412242'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
# get a list of all the h2 elements
head_list = soup.find_all('h2')
# loop over the list to find the heading where we start to scrape
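A sketch of that loop (the heading text to match is an assumption):
for heading in head_list:
    if 'Committee Membership' in heading.get_text():
        print(heading.get_text(strip=True))
        break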