I hereby claim:
- I am smerity on github.
- I am smerity (https://keybase.io/smerity) on keybase.
- I have a public key whose fingerprint is 56A2 5996 3078 B205 1053 883A 6615 0186 B74F 858B
To claim this, I am signing this object:
| 0 48 | |
| 0000 6 | |
| 0l 1 | |
| 0xdc00 13 | |
| 1 69 | |
| 10 11 | |
| 100 3 | |
| 1001 1 | |
| 100154 1 | |
| 1004 1 |
| #include <algorithm> | |
| #include <fstream> | |
| #include <iostream> | |
| #include <iterator> | |
| #include <map> | |
| #include <set> | |
| #include <sstream> | |
| #include <unordered_map> | |
| #include <vector> |
| package main | |
| import ( | |
| "encoding/gob" | |
| "fmt" | |
| "log" | |
| "net" | |
| "net/rpc" | |
| ) |
| smerity@pegasus:~/Coding/montelight/python$ time ~/Coding/Reference/pypy-2.2.1-linux64/bin/pypy -m cProfile minilight.py roomfront-n-1000.ml.txt | |
| MiniLight 1.6 Python - http://www.hxa.name/minilight | |
| iteration: 3^C | |
| interrupted | |
| 1155613811 function calls (1062023566 primitive calls) in 89.591 seconds | |
| Ordered by: standard name |
| # To run: python just_text.py > text | |
| ### | |
| from glob import glob | |
| # | |
| import warc | |
| # List any of the WARC files found in the data folder | |
| warc_files = glob('data/*.wet.gz') | |
| # Process each of the WARC files we found |
I hereby claim:
To claim this, I am signing this object:
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00000-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00001-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00002-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00003-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00004-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00005-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00006-ip-10-147-4-33.ec2.internal.warc.gz | |
| common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/C |
| import re | |
| # | |
| from collections import Counter | |
| from glob import glob | |
| from urlparse import urlparse | |
| # | |
| import warc | |
| # Extract the names and total usage count of all the opening HTML tags in the document |
| import boto | |
| from boto.s3.key import Key | |
| import zlib | |
| def stream_decompress_multi(stream): | |
| dec = zlib.decompressobj(16 + zlib.MAX_WBITS) | |
| while True: | |
| chunk = stream.read(1024 * 8) | |
| if not chunk: |