simonlindgren · October 6, 2017 05:35
diff --git a/ttd.py b/ttd.py
 '''
 TOPIC TOP DOCUMENTS

 Read the space-separated table in 'doc-topics_50.txt' (first line = column
 headers, which must include '#doc', 'topic' and 'proportion'), keep the rows
 for one topic, and write the '#doc' value of every row whose proportion is
 above a chosen threshold to 'topdocs.txt', one per line.
 '''

 # Required libraries and settings
 import re
 import pandas as pd

 # Do not truncate column contents when printing. Newer pandas expects None
 # for "unlimited" and rejects -1; older releases only accept an integer,
 # so fall back to the legacy value there.
 try:
     pd.set_option('display.max_colwidth', None)
 except ValueError:
     pd.set_option('display.max_colwidth', -1)


 # Input (the context manager closes the file once it has been read)
 with open('doc-topics_50.txt', 'r') as file:
     lines = file.readlines()

 # Make each line into a list of items to prepare for dataframe format.
 # The trailing newline is stripped first so it does not stay attached to
 # the last field (and to the last column header).
 lines = [line.rstrip('\n').split(" ") for line in lines]

 # Make dataframe
 df = pd.DataFrame(lines)
 df.columns = df.iloc[0] # set first line as column headers
 df = df[1:] # remove the first line

 # Make new dataframe based on the columns we need
 df = df.loc[:, ['#doc', 'topic', 'proportion']]

 # Keep only the rows with the topic number we want
 # (every field was read as text, so the topic is compared as a string)
 topic = "3"
 df = df.loc[df['topic'] == topic]

 # The proportions were read as text too; convert them to numbers once so
 # that every statistic below (including the median) is computed on floats.
 df = df.assign(proportion=df['proportion'].astype(float))
 proportion = df['proportion']

 # Set a threshold value for when a topic is 'strong'
 median = proportion.median() # we can use the median value
 percentile = proportion.quantile(.75) # or a certain percentile
 # or get, say, the top 20: the 21st largest value is the cut-off, so a
 # strict '>' comparison against it keeps the 20 rows above it
 n_largest = proportion.nlargest(20+1).min()
 split = 0.87 # or simply a manually set value
 print(median, percentile, split, n_largest) # Inspect the values if needed

 # In this example, we choose the median as threshold
 # Getting the documents (rows) that are above the threshold for our chosen topic
 df = df.loc[proportion > median]
 topdocs = df['#doc'].tolist()

 # Output: write all doc numbers to file, one per line. The context manager
 # guarantees the data is flushed and the file is closed.
 with open('topdocs.txt', 'w') as outfile:
     for doc in topdocs:
         outfile.write(doc + '\n')
	'''
	TOPIC TOP DOCUMENTS

	Select the documents in which one topic is 'strong'.

	Input:  'doc-topics_50.txt', a space-separated table whose first line
	        holds the column headers ('#doc', 'topic' and 'proportion' are used).
	Output: 'topdocs.txt', containing the '#doc' value of each selected row,
	        one per line.
	'''

	# Required libraries and settings
	import re
	import pandas as pd

	# Unlimited column width for printed frames. None is the value newer
	# pandas expects; -1 is kept as a fallback for older releases that only
	# accept an integer here.
	try:
		pd.set_option('display.max_colwidth', None)
	except ValueError:
		pd.set_option('display.max_colwidth', -1)


	# Input: read every line, then let the 'with' block close the file
	with open('doc-topics_50.txt', 'r') as file:
		lines = file.readlines()

	# Make each line into a list of items to prepare for dataframe format.
	# Dropping the line ending first keeps '\n' out of the last item.
	lines = [line.rstrip('\n').split(" ") for line in lines]

	# Make dataframe
	df = pd.DataFrame(lines)
	df.columns = df.iloc[0] # set first line as column headers
	df = df[1:] # remove the first line

	# Make new dataframe based on the columns we need
	df = df.loc[:, ['#doc', 'topic', 'proportion']]

	# Keep only the rows with the topic number we want
	# (a string, because all fields come straight from the text file)
	topic = "3"
	df = df.loc[df['topic'] == topic]

	# Turn the textual proportions into floats a single time, so the median
	# is computed on numbers just like the other candidate thresholds.
	df = df.assign(proportion=df['proportion'].astype(float))
	proportion = df['proportion']

	# Set a threshold value for when a topic is 'strong'
	median = proportion.median() # we can use the median value
	percentile = proportion.quantile(.75) # or a certain percentile
	# or get, say, the top 20 (smallest of the 21 largest values, used with '>')
	n_largest = proportion.nlargest(20+1).min()
	split = 0.87 # or simply a manually set value
	print(median, percentile, split, n_largest) # Inspect the values if needed

	# In this example, we choose the median as threshold
	# Getting the documents (rows) that are above the threshold for our chosen topic
	df = df.loc[proportion > median]
	topdocs = df['#doc'].tolist()

	# Output: the 'with' block flushes and closes the file when done
	with open('topdocs.txt', 'w') as outfile:
		for doc in topdocs: # Write all doc numbers to file
			outfile.write(doc + '\n')