estasney · June 29, 2018 01:47
diff --git a/analyze_tags.py b/analyze_tags.py
 # Given a StackOverflow tag, how often does it occur with other tags?
 # See example query https://data.stackexchange.com/stackoverflow/query/868423/co-occurrence-of-tags
 # Download CSV results

 import pandas as pd
 import re
 from sklearn.feature_extraction.text import CountVectorizer

 # Path to the CSV downloaded from the Data Explorer query, and the tag whose
 # co-occurrence counts are wanted. Both placeholders must be filled in.
 CSV_PATH = ""
 TAG = ""

 df = pd.read_csv(CSV_PATH)

 # Besides lowercase letters, digits and dots, tags may contain '+', '#' and
 # '-' (c++, c#, scikit-learn); without them in the class such tags are split
 # or truncated. The space is kept so multi-word matches are still captured.
 tag_search = re.compile(r"([a-z\. 0-9+#\-]+)")

 # Empty cells are read by pandas as NaN, which findall() cannot process.
 df['Tags'] = df['Tags'].fillna("").apply(tag_search.findall)
 # Build one whitespace-separated document per row. Inner spaces become
 # underscores so each tag stays a single token; blank matches are skipped.
 tags = df['Tags'].apply(
     lambda found: " ".join(w.strip().replace(" ", "_") for w in found if w.strip())
 )

 # The default token pattern splits on punctuation and drops one-character
 # tokens, which breaks tags such as "node.js" or "r"; split on whitespace
 # only. binary=True records presence (0/1) of a tag in a row, not a count.
 count_model = CountVectorizer(token_pattern=r"\S+", binary=True)
 X = count_model.fit_transform(tags)
 Xc = X.T @ X  # tag-by-tag co-occurrence counts (explicit matrix product)
 Xc.setdiag(0)  # a tag co-occurring with itself carries no information

 vocabulary = count_model.vocabulary_  # maps tag -> column index
 if TAG not in vocabulary:
     raise KeyError("tag {!r} does not occur in {!r}".format(TAG, CSV_PATH))
 # Order the tag names by their column index so they line up with Xc.
 tag_names = sorted(vocabulary, key=vocabulary.get)
 # Only the row for TAG is needed, so avoid densifying the whole matrix.
 counts = Xc[vocabulary[TAG]].toarray().ravel()
 # Single-column frame (column TAG, indexed by tag name), most frequent first.
 df1 = pd.DataFrame({TAG: counts}, index=tag_names).sort_values(TAG, ascending=False)
	# Given a StackOverflow tag, how often does it occur with other tags?
	# See example query https://data.stackexchange.com/stackoverflow/query/868423/co-occurrence-of-tags
	# Download CSV results

	import pandas as pd
	import re
	from sklearn.feature_extraction.text import CountVectorizer

	# Path to the CSV downloaded from the Data Explorer query, and the tag whose
	# co-occurrence counts are wanted. Both placeholders must be filled in.
	CSV_PATH = ""
	TAG = ""

	df = pd.read_csv(CSV_PATH)

	# Besides lowercase letters, digits and dots, tags may contain '+', '#' and
	# '-' (c++, c#, scikit-learn); without them in the class such tags are split
	# or truncated. The space is kept so multi-word matches are still captured.
	tag_search = re.compile(r"([a-z\. 0-9+#\-]+)")

	# Empty cells are read by pandas as NaN, which findall() cannot process.
	df['Tags'] = df['Tags'].fillna("").apply(tag_search.findall)
	# Build one whitespace-separated document per row. Inner spaces become
	# underscores so each tag stays a single token; blank matches are skipped.
	tags = df['Tags'].apply(
	    lambda found: " ".join(w.strip().replace(" ", "_") for w in found if w.strip())
	)

	# The default token pattern splits on punctuation and drops one-character
	# tokens, which breaks tags such as "node.js" or "r"; split on whitespace
	# only. binary=True records presence (0/1) of a tag in a row, not a count.
	count_model = CountVectorizer(token_pattern=r"\S+", binary=True)
	X = count_model.fit_transform(tags)
	Xc = X.T @ X  # tag-by-tag co-occurrence counts (explicit matrix product)
	Xc.setdiag(0)  # a tag co-occurring with itself carries no information

	vocabulary = count_model.vocabulary_  # maps tag -> column index
	if TAG not in vocabulary:
	    raise KeyError("tag {!r} does not occur in {!r}".format(TAG, CSV_PATH))
	# Order the tag names by their column index so they line up with Xc.
	tag_names = sorted(vocabulary, key=vocabulary.get)
	# Only the row for TAG is needed, so avoid densifying the whole matrix.
	counts = Xc[vocabulary[TAG]].toarray().ravel()
	# Single-column frame (column TAG, indexed by tag name), most frequent first.
	df1 = pd.DataFrame({TAG: counts}, index=tag_names).sort_values(TAG, ascending=False)