Last active
October 6, 2017 05:35
-
-
Save simonlindgren/9a5b0f7147a2d0a43cf413784d8eb5e9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
TOPIC TOP DOCUMENTS

Read the space-separated table in 'doc-topics_50.txt', keep the rows whose
'topic' column equals the chosen topic, and write the '#doc' value of every
row whose 'proportion' exceeds a threshold to 'topdocs.txt', one per line.
'''

# Required libraries and settings
import re

import pandas as pd

# None means "never truncate cell contents". The older sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Input
# The context manager closes the handle as soon as the lines have been read.
with open('doc-topics_50.txt', 'r') as infile:
    # Make each line into a list of items to prepare for dataframe format.
    # The line terminator is stripped first so that the last field of each
    # row (and the last column header) does not carry a trailing '\n'.
    lines = [line.rstrip('\n').split(" ") for line in infile]

# Make dataframe
df = pd.DataFrame(lines)
df.columns = df.iloc[0]  # set first line as column headers
df = df[1:]  # remove the first line

# Make new dataframe based on the columns we need
df = df.loc[:, ['#doc', 'topic', 'proportion']]

# Keep only the rows with the topic number we want.
# The cells are still unparsed text, hence the string comparison.
topic = "3"
df = df.loc[df['topic'] == topic]

# Convert the proportions to numbers once and reuse the result everywhere
# below. Taking the median of the raw string column raises TypeError on
# current pandas.
proportions = df['proportion'].astype(float)

# Set a threshold value for when a topic is 'strong'
median = proportions.median()  # we can use the median value
percentile = proportions.quantile(.75)  # or a certain percentile
# or get, say, the top 20: the smallest of the 21 largest values is the
# cut-off that a strict '>' comparison has to beat
n_largest = proportions.nlargest(20 + 1).min()
split = 0.87  # or simply a manually set value
print(median, percentile, split, n_largest)  # Inspect the values if needed

# In this example, we choose the median as threshold
# Getting the documents (rows) that are above the threshold for our chosen topic
df = df.loc[proportions > median]
topdocs = df['#doc'].tolist()

# Output
# Write all doc numbers to file; leaving the `with` block flushes and closes it.
with open('topdocs.txt', 'w') as outfile:
    outfile.writelines(doc + '\n' for doc in topdocs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment