Skip to content

Instantly share code, notes, and snippets.

@bennuttall
Last active April 3, 2021 21:22
Show Gist options
  • Save bennuttall/bc8df25783b1a76bea87aac6218d030b to your computer and use it in GitHub Desktop.
Save bennuttall/bc8df25783b1a76bea87aac6218d030b to your computer and use it in GitHub Desktop.
# Strip NUL bytes from every per-type log CSV, rewriting each file in place.
# NOTE(review): presumably needed because the psql \copy imports that follow
# reject NUL bytes -- confirm.
for f in */*LOG*.csv; do
    # When the glob matches nothing the shell leaves the pattern as a literal
    # string; skip it rather than trying to read a non-existent file.
    [ -e "$f" ] || continue
    # Replace the original only if tr succeeded, so a failed run (e.g. disk
    # full) cannot swap a truncated temp file over the real data.
    tr -d '\000' < "$f" > "$f.tmp" && mv "$f.tmp" "$f"
done
# import_logs TARGET FILE...
#
# Load each FILE into the piwheels database with psql's client-side \copy,
# printing the file name and the time taken for each import.  TARGET is the
# table name, optionally followed by a parenthesised column list.
# Note: sets the global shell variables "target" and "f".
import_logs() {
    target=$1
    shift
    for f in "$@"; do
        # An unmatched glob arrives here as the literal pattern; skip it
        # instead of asking psql to open a file that does not exist.
        [ -e "$f" ] || continue
        echo "$f"
        time psql piwheels -c "\copy $target FROM '$f' WITH (FORMAT csv);"
    done
}

import_logs "project_json_downloads" */*LOGJSON.csv
import_logs "web_page_hits (page, accessed_by, accessed_at, user_agent)" */*LOGPAGE.csv
import_logs "project_page_hits (package, accessed_by, accessed_at, user_agent)" */*LOGPROJECT.csv
import_logs "downloads" */*LOGDOWNLOAD.csv
import_logs "searches" */*LOGSEARCH.csv
import csv
import sys
import io
import gzip
from collections import defaultdict
from datetime import datetime
from piwheels.logger import ApacheSource, COMBINED, get_log_type, log_transform
# Rows stamped earlier than this are treated as carrying a bogus timestamp.
Y2K = datetime(2000, 1, 1)


def main(files):
    """Split gzipped access logs into one CSV file per log type.

    Each name in *files* is opened as an ASCII-encoded gzip file and read
    through ``ApacheSource(logf, COMBINED)``.  Every row is classified with
    ``get_log_type``; rows with a truthy type are converted with
    ``log_transform`` and collected.  Two kinds of output are written next
    to each input file:

    * ``<filename>-<log_type>.csv`` -- the transformed rows of that type.
    * ``<filename>-sizes.csv`` -- the sum of ``row.size`` per log type, with
      unclassified rows (type ``None``) reported as ``OTHER``.

    A row whose ``time`` is before ``Y2K`` has its time replaced by that of
    the most recent row with a valid time, before it is classified and
    transformed.  If no valid row has been seen yet the row is left as is.
    """
    for filename in files:
        logs = defaultdict(list)
        sizes = defaultdict(int)
        # Most recent valid timestamp seen in this file; None until one is
        # found, so a bad first row cannot hit an unbound variable.
        last_time = None
        print(filename)
        with gzip.open(filename, 'rt', encoding='ascii') as logf:
            with ApacheSource(logf, COMBINED) as src:
                for row in src:
                    # Repair the timestamp first so the corrected row is the
                    # one that actually gets transformed and stored.
                    # NOTE(review): rows are assumed to be namedtuple-like
                    # (they expose _replace) -- confirm against ApacheSource.
                    if row.time >= Y2K:
                        last_time = row.time
                    elif last_time is not None:
                        row = row._replace(time=last_time)
                    log_type = get_log_type(row)
                    sizes[log_type] += row.size
                    if log_type:
                        logs[log_type].append(
                            tuple(log_transform(row, log_type)))
        for log_type, rows in logs.items():
            log_csv = '{}-{}.csv'.format(filename, log_type)
            # newline='' lets the csv module control line endings itself.
            with open(log_csv, 'w', newline='') as csvf:
                csv.writer(csvf).writerows(rows)
        # Report unclassified traffic as OTHER; default to 0 so a log with
        # no unclassified rows (or no rows at all) does not raise KeyError.
        sizes['OTHER'] = sizes.pop(None, 0)
        sizes_csv = '{}-sizes.csv'.format(filename)
        with open(sizes_csv, 'w', newline='') as csvf:
            csv.writer(csvf).writerows(sizes.items())
if __name__ == '__main__':
    # Every command-line argument is passed to main() as a log file name.
    log_files = sys.argv[1:]
    main(log_files)
# Example invocation (-W ignore suppresses Python warnings):
# python -W ignore logger.py 202010/ssl_access.log-202010*.gz
# Restore only the data of selected tables from piwheels.dump.
# 1. Write the archive's table of contents to a scratch file.
pg_restore -l piwheels.dump > piwheels.list.tmp
# 2. Keep just the TABLE DATA entries of the wanted tables, in this order.
for table in build_abis preinstalled_apt_packages packages package_names \
             versions builds files dependencies; do
    grep "TABLE DATA public $table" piwheels.list.tmp
done > piwheels.list
# 3. Restore the listed entries (-L) into the piwheels database, verbosely
#    (-v) and inside a single transaction (-1).
pg_restore -1 -v -L piwheels.list -d piwheels < piwheels.dump
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment