Last active
December 15, 2022 10:34
-
-
Save satomacoto/5290437 to your computer and use it in GitHub Desktop.
kNN on xvideos.com-db.csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
from pymongo import MongoClient | |
# Module-level MongoDB handles shared by the functions below.
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used.
client = MongoClient()
db = client['xvideos']
def create_db():
    ''' load xvideos.com-db.csv into the ``videos`` collection

    Each line of the CSV is split on ';'.  Column 1 is stored as the
    title, column 5 (a comma separated list) as the list of tags and
    column 6 as the genre.  Lines with fewer than 7 columns are skipped.
    The import is best effort: a row that cannot be inserted is skipped
    and the remaining rows are still processed.
    '''
    # ``with`` guarantees the file is closed even if the import is aborted.
    with open('xvideos.com-db.csv') as f:
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would eat real data on a final line without a newline and
            # would leave a stray '\r' on CRLF files.
            row = line.rstrip('\r\n').split(';')
            if len(row) < 7:
                # Malformed / truncated row: nothing usable to store.
                continue
            title = row[1]
            tags = row[5].strip()
            genre = row[6]
            video = {'title': title,
                     'tags': tags.split(',') if tags else [],
                     'genre': genre}
            try:
                # NOTE(review): Collection.insert was deprecated in
                # PyMongo 3 and removed in PyMongo 4; switch to
                # insert_one() when running against a newer driver.
                db.videos.insert(video)
            except Exception:
                # Best effort: skip rows the driver rejects.  Unlike a bare
                # ``except`` this still lets KeyboardInterrupt / SystemExit
                # stop a long-running import.
                continue


create_db()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import heapq | |
import itertools | |
from pymongo import MongoClient | |
# Module-level MongoDB handles used by knn() below.
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used.
client = MongoClient()
db = client['xvideos']
# http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list | |
def most_common(L):
    ''' find most common item

    Returns the item that occurs most often in the list ``L``.  When
    several items share the highest count, the one whose first occurrence
    comes earliest in ``L`` wins.  Items only need to be sortable, not
    hashable.

    Raises ValueError if ``L`` is empty.
    '''
    def _auxfun(group):
        # ``group`` is one (item, iterator-of-equal-items) pair produced by
        # groupby.  It is unpacked here instead of in the signature because
        # tuple parameters are a SyntaxError on Python 3.
        item, iterable = group
        # Rank by count first; the negated first index makes the earliest
        # item win a tie under max().
        return sum(1 for _ in iterable), -L.index(item)

    # groupby only merges adjacent equal items, hence the sort.
    groups = itertools.groupby(sorted(L))
    return max(groups, key=_auxfun)[0]
def jaccard(L, M):
    ''' jaccard index

    Returns |L & M| / |L | M| as a float between 0.0 and 1.0, treating
    both arguments as sets (duplicates are ignored).  When both inputs are
    empty the ratio is undefined; 0.0 is returned in that case so that two
    untagged items are not reported as similar and callers never hit a
    ZeroDivisionError.
    '''
    a = set(L)
    b = set(M)
    union = a | b
    if not union:
        return 0.0
    # 1. * ... forces float division on Python 2 as well.
    return 1. * len(a & b) / len(union)
def knn(query_tags, k=10, dist=jaccard):
    ''' k nearest neighbor

    Predicts a genre for ``query_tags`` by majority vote among the ``k``
    stored videos whose tags score highest against the query.

    query_tags -- iterable of tag strings describing the query
    k          -- number of neighbours that take part in the vote
    dist       -- callable(query_tag_set, video_tag_set) returning a score
                  where larger means more similar (default: jaccard)

    Returns the winning genre, or None when no stored video shares a tag
    with the query (or ``k`` is not positive).
    '''
    # '$in' needs a real list; this also lets callers pass any iterable.
    wanted = list(query_tags)
    # Only videos sharing at least one tag with the query are fetched;
    # anything else cannot beat a candidate that has some overlap.
    # NOTE(review): 'Unknow' is presumably the dataset's own spelling for
    # unlabeled videos -- confirm against the CSV before "correcting" it.
    videos = db.videos.find({'tags': {'$in': wanted}, 'genre': {'$ne': 'Unknow'}})
    wanted_set = set(wanted)
    rank = [(dist(wanted_set, set(video['tags'])), video['genre'])
            for video in videos]
    # Tuples compare by score first, then by genre, so equal scores are
    # ordered by genre name.
    nearest = heapq.nlargest(k, rank)
    if not nearest:
        # No candidates: report "no prediction" instead of letting
        # most_common() fail on an empty list.
        return None
    return most_common([genre for _score, genre in nearest])
def test_knn():
    ''' smoke test: print the genre predicted for a fixed set of tags '''
    query_tags = ['shower', 'morning', 'toy', 'sexy']
    # The classifier is defined as ``knn``; the previous ``kNN`` spelling
    # raised NameError.  print(...) with a single argument behaves the same
    # on Python 2 and Python 3.
    print(knn(query_tags))
    # Batch variant, kept for manual experiments:
    # queries = db.videos.find({'genre': {'$ne': 'Unknow'}}).limit(100)
    # for query in queries:
    #     print('%s %s' % (query['genre'], knn(query['tags'])))


test_knn()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment