Skip to content

Instantly share code, notes, and snippets.

@simonlindgren
Last active October 6, 2017 05:35
Show Gist options
  • Save simonlindgren/9a5b0f7147a2d0a43cf413784d8eb5e9 to your computer and use it in GitHub Desktop.
Save simonlindgren/9a5b0f7147a2d0a43cf413784d8eb5e9 to your computer and use it in GitHub Desktop.
'''
TOPIC TOP DOCUMENTS

Read a space-separated doc-topics table whose first line is a header
containing at least the columns '#doc', 'topic' and 'proportion', keep the
rows of one topic, and write the '#doc' value of every row whose proportion
is strictly above a threshold to 'topdocs.txt' (one value per line).
'''
# Required libraries and settings
import re
import pandas as pd

# None means "never truncate cell contents"; the older -1 sentinel has been
# deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)


def read_doc_topics(path):
    '''
    Load the doc-topics file at `path` into a dataframe of strings.

    The first non-blank line supplies the column headers. Only the columns
    '#doc', 'topic' and 'proportion' are returned.

    Raises ValueError if the file holds no header line, and KeyError if one
    of the three required columns is missing from the header.
    '''
    with open(path, 'r') as infile:
        # Drop the line terminator before splitting; otherwise the last
        # field of every row (and the last header) keeps a trailing '\n'.
        rows = [line.rstrip('\r\n').split(' ') for line in infile if line.strip()]
    if not rows:
        raise ValueError('no header line found in ' + repr(path))
    # Build from the raw rows first so that rows of unequal length are
    # padded instead of rejected, then promote the first row to headers.
    df = pd.DataFrame(rows)
    df.columns = df.iloc[0]
    df = df[1:]
    return df.loc[:, ['#doc', 'topic', 'proportion']]


def topic_rows(df, topic):
    '''
    Return the rows of `df` whose 'topic' equals `topic` (a string, as read
    from the file), with 'proportion' converted to float exactly once so
    that every statistic and comparison works on numbers.
    '''
    rows = df.loc[df['topic'] == topic].copy()
    rows['proportion'] = rows['proportion'].astype(float)
    return rows


def docs_above(rows, threshold):
    '''
    Return the '#doc' values, in file order, of the rows whose proportion
    is strictly greater than `threshold`.
    '''
    return rows.loc[rows['proportion'] > threshold, '#doc'].tolist()


def write_docs(docs, path):
    '''Write each entry of `docs` on its own line to the file at `path`.'''
    with open(path, 'w') as outfile:
        outfile.writelines(doc + '\n' for doc in docs)


def main():
    # Input
    df = read_doc_topics('doc-topics_50.txt')
    # Keep only the rows with the topic number we want
    topic = "3"
    df = topic_rows(df, topic)
    # Set a threshold value for when a topic is 'strong'
    median = df['proportion'].median()  # we can use the median value
    percentile = df['proportion'].quantile(.75)  # or a certain percentile
    # or get, say, the top 20: the 21st largest value is the cut-off because
    # the comparison below is strict
    n_largest = df['proportion'].nlargest(20 + 1).min()
    split = 0.87  # or simply a manually set value
    print(median, percentile, split, n_largest)  # Inspect the values if needed
    # In this example, we choose the median as threshold
    topdocs = docs_above(df, median)
    # Output
    write_docs(topdocs, 'topdocs.txt')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment