Skip to content

Instantly share code, notes, and snippets.

View jsfenfen's full-sized avatar

Jacob Fenton jsfenfen

View GitHub Profile
@jsfenfen
jsfenfen / test_for_pdf.py
Last active February 18, 2017 06:40
Test whether a file is a PDF by downloading only its first 4 bytes. Checking the HTTP headers (e.g. Content-Type) may be a better approach, but this works even if you think the headers may be wrong.
import requests
def test_for_pdf(url):
    """Return True if the body served at *url* begins with the PDF signature.

    The request is made with ``stream=True`` so that only the start of the
    body is read instead of downloading the whole document.

    Args:
        url: Address of the resource to probe.

    Returns:
        bool: True when the first four bytes of the body are ``%PDF``,
        False otherwise (including when the body is empty or shorter than
        four bytes).

    Raises:
        requests.RequestException: propagated unchanged if the request fails.
    """
    # iter_content() yields bytes, so the signature must be a bytes literal;
    # comparing against the str '%PDF' is always False on Python 3.
    magic = b'%PDF'
    r = requests.get(url, stream=True)
    try:
        head = b''
        # A yielded chunk is not guaranteed to be exactly chunk_size long,
        # so accumulate until enough bytes are available (or the body ends).
        for chunk in r.iter_content(chunk_size=len(magic)):
            head += chunk
            if len(head) >= len(magic):
                break
        return head[:len(magic)] == magic
    finally:
        # With stream=True the connection stays checked out until the body
        # is consumed or the response is closed; release it explicitly.
        r.close()
# See pdf file spec, p. 92: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
if __name__ == '__main__':
    # Demo run against a URL that is expected to serve a PDF.
    url = 'https://fremont.gov/DocumentCenter/View/31856'
    is_pdf = test_for_pdf(url)
    print(url + " is pdf? " + str(is_pdf))
SELECT ••• FROM "core_candidate" INNER JOIN "core_person" ON ("core_candidate"."person_ptr_id" = "core_person"."person_id") WHERE "core_person"."fec_id" = 'S8NY00082'
4.42025803649%
104.12
+
SELECT ••• FROM "core_candidate" INNER JOIN "core_person" ON ("core_candidate"."person_ptr_id" = "core_person"."person_id") WHERE "core_candidate"."fec_candidate_id" = 'S8NY00082'
4.09909409277%
96.55
import sys
import os
import pdfplumber
# Checkbox detection is restricted to rectangles of this size (square boxes).
# NOTE(review): units are presumably those reported by pdfplumber - confirm.
RECT_WIDTH = RECT_HEIGHT = 9.3
# NOTE(review): presumably the allowed deviation from the size above when
# matching rectangles - confirm against the code that uses it.
RECT_TOLERANCE = 2
pageid page_dim text object_type height width x0 x1 y0 y1
1 2164x1664 Report: word 18 90 95 185 1584 1602
1 2164x1664 CHECKREG word 15 107 205 312 1587 1602
1 2164x1664 Generated: word 16 131 560 691 1585 1601
1 2164x1664 26JUN15 word 16 92 711 803 1585 1601
1 2164x1664 17:19 word 17 64 862 926 1584 1601
1 2164x1664 Run: word 14 48 1147 1195 1584 1598
1 2164x1664 THURSDAY word 15 107 1216 1323 1583 1598
1 2164x1664 FEB1116 word 17 92 1339 1431 1582 1599
1 2164x1664 9:07 word 16 50 1490 1540 1582 1598
@jsfenfen
jsfenfen / output.png
Created February 17, 2016 07:01 — forked from endolith/output.png
Detecting rotation and line spacing of image of page of text using Radon transform
output.png
missing key: F501_502.REC_TYPE
missing key: F501_502.FORM_TYPE
missing key: F501_502.FILER_ID
missing key: F501_502.COMMITTEE_ID
missing key: F501_502.ENTITY_CD
missing key: F501_502.FROM_DATE
missing key: F501_502.THRU_DATE
missing key: F501_502.ELECT_DATE
missing key: F501_502.CAND_NAML
missing key: F501_502.CAND_NAMF
model._meta.db_table str(field).split('.')[2].upper() field.maxlength
CVR_SO_CD ID None
CVR_SO_CD ACCT_OPENDT None
CVR_SO_CD ACTVTY_LVL 2
CVR_SO_CD AMEND_ID None
CVR_SO_CD BANK_ADR1 55
CVR_SO_CD BANK_ADR2 55
CVR_SO_CD BANK_CITY 30
CVR_SO_CD BANK_NAM 200
CVR_SO_CD BANK_PHON 20
doc_source_page table fieldname maxlength
14 CVR_CAMPAIGN_DISCLOSURE ENTITY_CD 3
14 CVR_CAMPAIGN_DISCLOSURE FORM_TYPE 4
14 CVR_CAMPAIGN_DISCLOSURE ENTITY_CD 3
14 CVR_CAMPAIGN_DISCLOSURE FILER_ID 9
14 CVR_CAMPAIGN_DISCLOSURE REC_TYPE 3
14 CVR_CAMPAIGN_DISCLOSURE ENTITY_CD 3
14 CVR_CAMPAIGN_DISCLOSURE FILING_ID
14 CVR_CAMPAIGN_DISCLOSURE AMEND_ID
14 CVR_CAMPAIGN_DISCLOSURE ENTITY_CD 3
{
"C000127": [
"http://www.flickr.com/photos/mariacantwell"
],
"C000141": [
"http://www.flickr.com/photos/senatorbencardin"
],
"C000174": [
"http://www.flickr.com/photos/55205960%40N08/",
"http://www.flickr.com/photos/55205960@N08/"
# crosswalk for bioguide ids and 2014 house members candidate ids 4/16/2013
house_2014_crosswalk = [
{'bioguide':'Y000033', 'fec_id':'H6AK00045'},
{'bioguide':'A000055', 'fec_id':'H6AL04098'},
{'bioguide':'B000013', 'fec_id':'H2AL06035'},
{'bioguide':'B001244', 'fec_id':'H2AL01077'},
{'bioguide':'B001274', 'fec_id':'H0AL05163'},
{'bioguide':'R000591', 'fec_id':'H0AL02087'},
{'bioguide':'R000575', 'fec_id':'H2AL03032'},