Skip to content

Instantly share code, notes, and snippets.

View mizvol's full-sized avatar
🦒
Focusing

Volodymyr Miz mizvol

🦒
Focusing
View GitHub Profile
@mizvol
mizvol / Tags LDA topic analysis.ipynb
Created January 18, 2017 10:03
LDA topic analysis of Instagram hashtags for clustering. Analysis and visualization in D3.js.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import org.apache.spark.sql.SparkSession
object SparkWordCount extends App {
val spark = SparkSession.builder
.master("local[*]")
.appName("Spark Word Count")
.getOrCreate()
val lines = spark.sparkContext.parallelize(
@mizvol
mizvol / build.sbt
Last active October 26, 2018 15:29
// sbt build definition for the SparkScalaTest project.
name := "SparkScalaTest"

version := "1.0"

scalaVersion := "2.11.12"

// Single place to bump the Spark release; both modules below share it.
val sparkVersion = "2.3.2"

// `%%` appends the Scala binary version to each artifact name.
libraryDependencies ++= Seq("spark-core", "spark-sql").map { module =>
  "org.apache.spark" %% module % sparkVersion
}
// Build the "tags" collection from allPosts: one document per user id,
// holding the distinct tags collected from that user's posts.
db.allPosts.aggregate([
    // Collect the "tags" value of every post into a set, keyed by user id.
    { "$group": { "_id": "$user.id", "tags": { "$addToSet": "$tags" } } },
    // Two unwinds: the first splits the set into its members, the second
    // splits each member again (presumably each post's "tags" is itself an
    // array, so the set is an array of arrays -- confirm against the data).
    { "$unwind": "$tags" },
    { "$unwind": "$tags" },
    // Regroup under the same key so each individual tag is kept once per user.
    { "$group": { "_id": "$_id", "tags": { "$addToSet": "$tags" } } },
    // Write the result to the "tags" collection.
    { "$out": "tags" }
])
import pymongo as pm
import unicodedata
client = pm.MongoClient()
db = client.instagram
tagsDB = db.tags
tagsList = []
for tag in tagsDB.find():
tagsList.append((str(tag['_id']), [unicodedata.normalize('NFKD', t).encode('ascii','ignore')
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.feature import IDF
from pyspark.ml.feature import CountVectorizer
# Vectorize the tags array of each user: fit a vocabulary on the "tokens"
# column and emit one term-count vector per row in the "features" column.
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features").fit(tagsListDF)
countVectors = vectorizer.transform(tagsListDF).select("id", "features")
# Find TF-IDF coefficients for each tag.
# Keep only the "features" vector (second selected column) of every row;
# presumably this is the input of the IDF step, which is not part of this excerpt.
# The DataFrame is converted with .rdd first because DataFrame.map is not
# available in the PySpark 2.x API, while .rdd exists in 1.x and 2.x alike.
frequencyVectors = countVectors.rdd.map(lambda vector: vector[1])
# Turn the top terms of every LDA topic into a flat DataFrame with one row per
# (term, probability, topicId).
topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)
vocablist = vectorizer.vocabulary
topicsRDD = sc.parallelize(topicIndices)


def _pair_terms_with_weights(topic):
    # topic[0] holds vocabulary indices, topic[1] the matching weights;
    # the indices are resolved to the actual terms through vocablist.
    term_ids = topic[0]
    weights = topic[1]
    return zip(itemgetter(*term_ids)(vocablist), weights)


def _attach_topic_id(indexed_topic):
    # indexed_topic is (term/weight pairs, position assigned by zipWithIndex).
    pairs = indexed_topic[0]
    topic_id = indexed_topic[1]
    return [(pair[0], pair[1], topic_id) for pair in pairs]


termsRDD = topicsRDD.map(_pair_terms_with_weights)
indexedTermsRDD = termsRDD.zipWithIndex()
termsRDD = indexedTermsRDD.flatMap(_attach_topic_id)
termDF = termsRDD.toDF(['term', 'probability', 'topicId'])
from requests import get, Session, adapters
def getInstaPosts(latitude, longitude, distance, minTimestamp, maxTimestamp, count):
params = {
'lat': latitude,
'lng': longitude,
'distance': distance, # radius of requested area
'min_timestamp': str(minTimestamp), #start date
'max_timestamp': str(maxTimestamp), #end date
'count': COUNT, # number of posts(100 max)
// Build the "users" collection: one document per distinct user id in allPosts.
db.allPosts.aggregate([
    // Grouping with only an _id keeps a single document for each user id.
    { "$group": { "_id": "$user.id" } },
    // Write the result to the "users" collection.
    { "$out": "users" }
]);
def getFollowers(userId, nextCursor):
params = {
'cursor': nextCursor,
'access_token': INSTAGRAM_ACCESS_TOKEN
}
session = Session()
session.mount("https://", adapters.HTTPAdapter(max_retries=50))
response = session.get("https://api.instagram.com/v1/users/" + userId + "/followed-by", params = params, verify = True)