Using Python's built-in defaultdict we can easily define a tree data structure:

    def tree(): return defaultdict(tree)

That's it!
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import os | |
| import json | |
| import requests | |
# Authentication cookies for archive.org requests, read from the environment.
# Looking the variables up with os.environ[...] (not .get) deliberately raises
# KeyError at import time if either credential is missing, so the script fails
# fast before attempting any request.
COOKIES = {
    'logged-in-sig': os.environ['LOGGED_IN_SIG'],
    'logged-in-user': os.environ['LOGGED_IN_USER'],
}
| #!/usr/bin/env python | |
| import os, sys | |
| import requests | |
| import json | |
# Authentication cookies for archive.org requests, read from the environment
# (raises KeyError at import time if either credential is unset — fail fast).
# 'verbose': '1' presumably requests more detailed server responses —
# TODO(review): confirm against the archive.org endpoint this is sent to.
COOKIES = {
    'logged-in-sig': os.environ['LOGGED_IN_SIG'],
    'logged-in-user': os.environ['LOGGED_IN_USER'],
    'verbose': '1',
}
| #!/bin/bash | |
| # Download all Symlink Instruction files from a given list of identifiers. | |
| # Usage: ./get-symlinks.sh itemlist.txt | |
| while read identifier | |
| do | |
| mkdir -p symlink_instructions | |
| URL="http://archive.org/download/${identifier}/${identifier}_symlinks.txt" | |
| COOK="Cookie: logged-in-sig=$LOGGED_IN_SIG; logged-in-user=$LOGGED_IN_USER" | |
| wget -q "$URL" -O symlink_instructions/${identifier}_symlinks.txt \ |
| #!/usr/bin/env python | |
| import os | |
| #import lxml.html, lxml.etree | |
| import lxml.etree | |
| import subprocess | |
| import json | |
| import urllib | |
# Working directory at startup; NOTE(review): presumably used elsewhere in the
# script as the base for relative paths — callers are not visible in this chunk.
ROOT_DIR = os.getcwd()

# Shared HTML parser that forces UTF-8 decoding of input documents, regardless
# of any charset the document itself declares.
utf8_parser = lxml.etree.HTMLParser(encoding='utf-8')
| #!/usr/bin/env python | |
| """ Check if an item on archive.org has an acoustid. | |
| Usage: | |
| ./check_for_acoustid.py {item} | |
| Usage with GNU Parallel: | |
| cat itemlist.txt | parallel --max-procs=8 --group './check_for_acoustid.py {}' |
Using Python's built-in defaultdict we can easily define a tree data structure:

    def tree(): return defaultdict(tree)

That's it!
| # This demonstrates doing multiple metadata fetches in parallel. | |
| # It seems to be fast enough that the json decoding cost becomes | |
| # a significant proportion of the execution time. | |
| # It requires gevent; see http://www.gevent.org/intro.html#installation | |
| # To make this do something useful, modify do_work(). | |
| import gevent |
# Given an archive.org task id ($1), find the storage node and item path the
# task operated on, then query that node for the item's size in GB.
taskid="$1"

# Fetch the full task log for this task id.
log=$(curl -s "http://www-tracey.us.archive.org/log_show.php?task_id=$taskid&full=1")

# Item directory path from the log's "[dir] => ..." entry.
# NOTE(review): the lookbehind matches one hard-coded item path — this looks
# like a debugging leftover; confirm whether a generic pattern was intended.
filepath=$(echo $log | grep -Po '(?<=\[dir\]\ \=\>\ )/14/items/archiveteam-mobileme-hero-9' | head -1)

# Storage node name ("ia6NNNNN") parsed out of the log's "[server ...]" entry.
node=$(echo $log | grep -Po '(?<=\[server).*?(?=.us.archive.org)' | head -1 | grep -Po 'ia6[0-9]{5}')

# Endpoint on that node which reports the size of an item at a given path.
itemsize_url="http://$node.us.archive.org/item-size.php?path=$filepath"

# Item size in KB, extracted from the <size> element of the response.
_itemsize=$(curl -s "$itemsize_url" | grep -Po '(?<=\<size\>).*(?=\</size\>)')

# Convert KB to GB (integer division via bc).
itemsize=$(echo "$_itemsize/1000000" | bc)
| #!/usr/bin/env python | |
| # | |
| # Provided a list of identifiers for items on archive.org, return all items | |
| # that have an "acoustid" for every original audio file, but NOT a | |
| # "mb_recording_id". | |
| # | |
| import sys | |
| import logging | |
| from datetime import datetime |
| #!/usr/bin/env python | |
| # | |
| # Find out the most used metadata fields on archive.org | |
| # | |
| import sys | |
| import logging | |
| from datetime import datetime | |
| import ujson as json | |
| import cPickle as pickle |