satomacoto · December 15, 2022 10:34
diff --git a/create.py b/create.py
 # -*- coding:utf-8 -*-

 from pymongo import MongoClient

 # Module-level MongoDB handles used by the code below.
 # NOTE(review): MongoClient() is called without arguments, so it presumably
 # connects to PyMongo's default local server -- confirm for your deployment.
 client = MongoClient()
 # Database named "xvideos" on that client.
 db = client.xvideos

 def create_db():
     ''' Load xvideos.com-db.csv into the ``videos`` collection.

     Every line of the file is split on ';'.  Field 1 is stored as the
     title, field 5 as a list of comma-separated tags (an empty field
     becomes ``[]``) and field 6 as the genre.  A row that cannot be
     parsed or inserted is skipped so that a single bad record does not
     abort the whole import.
     '''
     # ``with`` guarantees the file is closed, even if the import is aborted.
     with open('xvideos.com-db.csv') as f:
         for line in f:
             # Strip only the line terminator: slicing off the last character
             # would eat real data when the final line has no trailing newline.
             row = line.rstrip('\n').split(';')
             try:
                 title = row[1]
                 tags = row[5].strip()
                 genre = row[6]
                 video = {'title': title,
                          'tags': tags.split(',') if tags else [],
                          'genre': genre}
                 # NOTE(review): Collection.insert is deprecated in PyMongo 3
                 # and removed in PyMongo 4 -- switch to insert_one when the
                 # driver is upgraded.
                 db.videos.insert(video)
             except Exception:
                 # Best-effort import: skip short/malformed rows and rows the
                 # driver rejects.  Unlike a bare ``except:``, this lets
                 # KeyboardInterrupt and SystemExit propagate.
                 continue

 create_db()
diff --git a/knn.py b/knn.py
 # -*- coding:utf-8 -*-

 import heapq
 import itertools
 from pymongo import MongoClient

 # Module-level MongoDB handles; knn() below reads from db.videos.
 # NOTE(review): MongoClient() is called without arguments, so it presumably
 # connects to PyMongo's default local server -- confirm for your deployment.
 client = MongoClient()
 # Database named "xvideos" on that client.
 db = client.xvideos

 # http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list
 def most_common(L):
     ''' Return the item that occurs most often in the sequence ``L``.

     Ties are broken in favour of the item whose first occurrence in ``L``
     comes earliest.  The items must be mutually orderable because they are
     sorted before grouping.  Raises ValueError when ``L`` is empty.
     '''
     # Sorting puts equal items next to each other so groupby yields one
     # (item, run-of-equal-items) pair per distinct value.
     groups = itertools.groupby(sorted(L))

     def _auxfun(group):
         # Tuple parameters in a ``def`` signature are Python 2 only
         # (removed by PEP 3113); unpacking here works on both versions.
         item, iterable = group
         # Rank by count first, then by earliest position in the input.
         return len(list(iterable)), -L.index(item)

     return max(groups, key=_auxfun)[0]

 def jaccard(L, M):
     ''' Return the Jaccard index of two iterables of hashable items.

     The result is len(intersection) / len(union) of the two item sets, a
     float between 0.0 (nothing shared) and 1.0 (identical sets).  Two empty
     inputs are treated as identical and give 1.0.
     '''
     a = set(L)
     b = set(M)
     union = a | b
     if not union:
         # Both inputs are empty: the ratio would be 0/0.  Follow the usual
         # convention J(empty, empty) = 1 instead of raising
         # ZeroDivisionError.
         return 1.0
     # The leading ``1.`` forces float division on Python 2 as well.
     return 1. * len(a & b) / len(union)

 def knn(query_tags, k=10, dist=jaccard):
     ''' Predict a genre for ``query_tags`` by a k-nearest-neighbour vote.

     Candidates are the videos that share at least one tag with the query
     and whose genre is not 'Unknow'.  Each candidate is scored with
     ``dist(query tag set, video tag set)``; the genres of the ``k`` largest
     (score, genre) pairs are handed to most_common() and its answer is
     returned.
     '''
     # The query is sent with the tags exactly as passed in; the set is only
     # needed for scoring.
     candidates = db.videos.find({'tags': {'$in': query_tags}, 'genre': {'$ne': 'Unknow'}})
     query_set = set(query_tags)

     scored = [(dist(query_set, set(doc['tags'])), doc['genre'])
               for doc in candidates]
     nearest = heapq.nlargest(k, scored)
     return most_common([genre for _, genre in nearest])

 def test_knn():
     ''' Smoke test: classify one hard-coded tag list and print the genre. '''
     query_tags = ['shower', 'morning', 'toy', 'sexy']
     # The classifier is defined as ``knn``; the former ``kNN`` spelling
     # raised NameError.  print(...) with a single argument behaves the same
     # on Python 2 and Python 3.
     print(knn(query_tags))

     # queries = db.videos.find({'genre': {'$ne': 'Unknow'}}).limit(100)
     # for query in queries:
     #     print query['genre'], knn(query['tags'])

 # Call the function under the name it is actually defined with.
 test_knn()
	# -*- coding:utf-8 -*-

	from pymongo import MongoClient

	# Module-level MongoDB handles used by the code below.
	# NOTE(review): MongoClient() is called without arguments, so it presumably
	# connects to PyMongo's default local server -- confirm for your deployment.
	client = MongoClient()
	# Database named "xvideos" on that client.
	db = client.xvideos

	def create_db():
	    ''' Load xvideos.com-db.csv into the ``videos`` collection.

	    Every line of the file is split on ';'.  Field 1 is stored as the
	    title, field 5 as a list of comma-separated tags (an empty field
	    becomes ``[]``) and field 6 as the genre.  A row that cannot be
	    parsed or inserted is skipped so that a single bad record does not
	    abort the whole import.
	    '''
	    # ``with`` guarantees the file is closed, even if the import is aborted.
	    with open('xvideos.com-db.csv') as f:
	        for line in f:
	            # Strip only the line terminator: slicing off the last character
	            # would eat real data when the final line has no trailing newline.
	            row = line.rstrip('\n').split(';')
	            try:
	                title = row[1]
	                tags = row[5].strip()
	                genre = row[6]
	                video = {'title': title,
	                         'tags': tags.split(',') if tags else [],
	                         'genre': genre}
	                # NOTE(review): Collection.insert is deprecated in PyMongo 3
	                # and removed in PyMongo 4 -- switch to insert_one when the
	                # driver is upgraded.
	                db.videos.insert(video)
	            except Exception:
	                # Best-effort import: skip short/malformed rows and rows the
	                # driver rejects.  Unlike a bare ``except:``, this lets
	                # KeyboardInterrupt and SystemExit propagate.
	                continue

	create_db()
	# -*- coding:utf-8 -*-

	import heapq
	import itertools
	from pymongo import MongoClient

	# Module-level MongoDB handles; knn() below reads from db.videos.
	# NOTE(review): MongoClient() is called without arguments, so it presumably
	# connects to PyMongo's default local server -- confirm for your deployment.
	client = MongoClient()
	# Database named "xvideos" on that client.
	db = client.xvideos

	# http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list
	def most_common(L):
	    ''' Return the item that occurs most often in the sequence ``L``.

	    Ties are broken in favour of the item whose first occurrence in ``L``
	    comes earliest.  The items must be mutually orderable because they are
	    sorted before grouping.  Raises ValueError when ``L`` is empty.
	    '''
	    # Sorting puts equal items next to each other so groupby yields one
	    # (item, run-of-equal-items) pair per distinct value.
	    groups = itertools.groupby(sorted(L))

	    def _auxfun(group):
	        # Tuple parameters in a ``def`` signature are Python 2 only
	        # (removed by PEP 3113); unpacking here works on both versions.
	        item, iterable = group
	        # Rank by count first, then by earliest position in the input.
	        return len(list(iterable)), -L.index(item)

	    return max(groups, key=_auxfun)[0]

	def jaccard(L, M):
	    ''' Return the Jaccard index of two iterables of hashable items.

	    The result is len(intersection) / len(union) of the two item sets, a
	    float between 0.0 (nothing shared) and 1.0 (identical sets).  Two empty
	    inputs are treated as identical and give 1.0.
	    '''
	    a = set(L)
	    b = set(M)
	    # Plain set union; the backslash-escaped pipe in the pasted text was
	    # not valid Python.
	    union = a | b
	    if not union:
	        # Both inputs are empty: the ratio would be 0/0.  Follow the usual
	        # convention J(empty, empty) = 1 instead of raising
	        # ZeroDivisionError.
	        return 1.0
	    # The leading ``1.`` forces float division on Python 2 as well.
	    return 1. * len(a & b) / len(union)

	def knn(query_tags, k=10, dist=jaccard):
	    ''' Predict a genre for ``query_tags`` by a k-nearest-neighbour vote.

	    Candidates are the videos that share at least one tag with the query
	    and whose genre is not 'Unknow'.  Each candidate is scored with
	    ``dist(query tag set, video tag set)``; the genres of the ``k`` largest
	    (score, genre) pairs are handed to most_common() and its answer is
	    returned.
	    '''
	    # The query is sent with the tags exactly as passed in; the set is only
	    # needed for scoring.
	    videos = db.videos.find({'tags': {'$in': query_tags}, 'genre': {'$ne': 'Unknow'}})
	    query_tags = set(query_tags)

	    rank = []
	    for video in videos:
	        tags = set(video['tags'])
	        rank.append((dist(query_tags, tags), video['genre']))
	    # Use a distinct loop name so the ``k`` parameter is not rebound.
	    return most_common([genre for _, genre in heapq.nlargest(k, rank)])

	def test_knn():
	    ''' Smoke test: classify one hard-coded tag list and print the genre. '''
	    query_tags = ['shower', 'morning', 'toy', 'sexy']
	    # The classifier is defined as ``knn``; the former ``kNN`` spelling
	    # raised NameError.  print(...) with a single argument behaves the same
	    # on Python 2 and Python 3.
	    print(knn(query_tags))

	    # queries = db.videos.find({'genre': {'$ne': 'Unknow'}}).limit(100)
	    # for query in queries:
	    #     print query['genre'], knn(query['tags'])

	# Call the function under the name it is actually defined with.
	test_knn()