purpleidea · March 3, 2016 07:54
diff --git a/dnf_count_files.py b/dnf_count_files.py
 #!/usr/bin/python
 # James Shubin, @purpleidea, 2016+, AGPLv3+
 # Count number of files in each package, and figure out which has the most
 # We took a string based parsing approach to the xml filelists for simplicity
 # When I ran this, the max was: kcbench-data-4.0, with 52116 files
 # Verify with dnf repoquery --quiet -l kcbench-data-4.0 | wc -l
 # To run this script, do something like the following:
 # wget http://mirror.its.dal.ca/pub/fedora/linux/releases/23/Everything/x86_64/os/repodata/874f220caf48ccd307c203772c04b8550896c42a25f82b93bd17082d69df80db-filelists.xml.gz
 # gunzip 874f220caf48ccd307c203772c04b8550896c42a25f82b93bd17082d69df80db-filelists.xml.gz
 # time cat 874f220caf48ccd307c203772c04b8550896c42a25f82b93bd17082d69df80db-filelists.xml | ./dnf_count_files.py > /tmp/output
 # head /tmp/output

 import sys
 import operator

 def parse_package_name(line):
 	"""Return the value of the name="..." attribute found on `line`.

 	Returns None when the line has no name=" marker or the value is not
 	terminated by a closing double quote.
 	"""
 	marker = 'name="'
 	start = line.find(marker)
 	if start == -1:
 		return None
 	begin = start + len(marker)	# first character of the attribute value
 	end = line.find('"', begin)
 	if end == -1:
 		return None
 	return line[begin:end]


 def count_files(lines):
 	"""Count the `  <file` lines that follow each `<package` line.

 	`lines` is any iterable of text lines (e.g. sys.stdin). The result is a
 	dict mapping package name to the number of file lines seen before the next
 	`<package` line or the end of input. If a name appears on more than one
 	`<package` line, the count from the last occurrence is kept.

 	This is a dirty hack: it matches on line prefixes and does not parse xml.
 	"""
 	counts = {}
 	name = None
 	files = 0
 	for line in lines:
 		if line.startswith("<package"):
 			if name:
 				counts[name] = files	# store the package we just finished
 			name = parse_package_name(line)
 			files = 0
 		elif line.startswith("  <file"):
 			files += 1
 	# The final package has no following <package line to trigger the store
 	# above, so flush it here; otherwise it would be silently dropped.
 	if name:
 		counts[name] = files
 	return counts


 def main():
 	"""Read the filelists from stdin and print "name<TAB>count" per package."""
 	counts = count_files(sys.stdin)
 	# Sort ascending by count and walk the result backwards, so that the
 	# package with the most files is printed first.
 	sorted_counts = sorted(counts.items(), key=operator.itemgetter(1))
 	for pkg, total in reversed(sorted_counts):
 		print("%s\t%d" % (pkg, total))


 if __name__ == "__main__":
 	main()
	#!/usr/bin/python
	# James Shubin, @purpleidea, 2016+, AGPLv3+
	# Count number of files in each package, and figure out which has the most
	# We took a string based parsing approach to the xml filelists for simplicity
	# When I ran this, the max was: kcbench-data-4.0, with 52116 files
	# Verify with dnf repoquery --quiet -l kcbench-data-4.0 | wc -l
	# To run this script, do something like the following:
	# wget http://mirror.its.dal.ca/pub/fedora/linux/releases/23/Everything/x86_64/os/repodata/874f220caf48ccd307c203772c04b8550896c42a25f82b93bd17082d69df80db-filelists.xml.gz
	# gunzip 874f220caf48ccd307c203772c04b8550896c42a25f82b93bd17082d69df80db-filelists.xml.gz
	# time cat 874f220caf48ccd307c203772c04b8550896c42a25f82b93bd17082d69df80db-filelists.xml | ./dnf_count_files.py > /tmp/output
	# head /tmp/output

	import sys
	import operator

	def parse_package_name(line):
		"""Return the value of the name="..." attribute found on `line`.

		Returns None when the line has no name=" marker or the value is not
		terminated by a closing double quote.
		"""
		marker = 'name="'
		start = line.find(marker)
		if start == -1:
			return None
		begin = start + len(marker)	# first character of the attribute value
		end = line.find('"', begin)
		if end == -1:
			return None
		return line[begin:end]


	def count_files(lines):
		"""Count the `  <file` lines that follow each `<package` line.

		`lines` is any iterable of text lines (e.g. sys.stdin). The result is a
		dict mapping package name to the number of file lines seen before the
		next `<package` line or the end of input. If a name appears on more than
		one `<package` line, the count from the last occurrence is kept.

		This is a dirty hack: it matches on line prefixes and does not parse xml.
		"""
		counts = {}
		name = None
		files = 0
		for line in lines:
			if line.startswith("<package"):
				if name:
					counts[name] = files	# store the package we just finished
				name = parse_package_name(line)
				files = 0
			elif line.startswith("  <file"):	# file entries sit two spaces in
				files += 1
		# The final package has no following <package line to trigger the store
		# above, so flush it here; otherwise it would be silently dropped.
		if name:
			counts[name] = files
		return counts


	def main():
		"""Read the filelists from stdin and print "name<TAB>count" per package."""
		counts = count_files(sys.stdin)
		# Sort ascending by count and walk the result backwards, so that the
		# package with the most files is printed first.
		sorted_counts = sorted(counts.items(), key=operator.itemgetter(1))
		for pkg, total in reversed(sorted_counts):
			print("%s\t%d" % (pkg, total))


	if __name__ == "__main__":
		main()
No results found