Created
November 21, 2019 23:24
-
-
Save diegotf30/067222c49db318d7b9ae9b338f304d28 to your computer and use it in GitHub Desktop.
Twitter ADB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Proyecto de Twitter - Bases de Datos Avanzadas" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Inicializar MongoDB & API con Llaves" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 154, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import tweepy\n", | |
"\n", | |
"api_key = '-'\n", | |
"api_secret = '-'\n", | |
"access_token = '-'\n", | |
"access_secret = '-'\n", | |
"\n", | |
"auth = tweepy.OAuthHandler(api_key, api_secret)\n", | |
"auth.set_access_token(access_token, access_secret)\n", | |
"api = tweepy.API(auth)\n", | |
"\n", | |
"client = MongoClient('mongodb://localhost/TwitterADB')\n", | |
"db = client.Twitter" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Stream de Tweets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pymongo import MongoClient\n", | |
"import time\n", | |
"\n", | |
"\n", | |
"class StreamListener(tweepy.StreamListener):\n", | |
" def __init__(self, time_limit=60):\n", | |
" self.start_time = time.time()\n", | |
" self.limit = time_limit\n", | |
" self.api = api\n", | |
" self.start_amount = db.Tweets.count_documents({}) # used to print # tweets the stream added\n", | |
"\n", | |
" def on_status(self, status):\n", | |
" tweet = {\n", | |
" \"user\": {\n", | |
" \"accountName\": status.author.screen_name,\n", | |
" \"isVerified\": status.user.verified,\n", | |
" \"joinDate\": status.user.created_at,\n", | |
" \"followers\": status.user.followers_count,\n", | |
" \"headerColor\": status.user.profile_background_color,\n", | |
" \"location\": status.user.location,\n", | |
" \"isGeoEnabled\": status.user.geo_enabled\n", | |
" },\n", | |
" \"text\": status.text,\n", | |
" \"rts\": status.retweet_count,\n", | |
" \"favs\": status.favorite_count,\n", | |
" \"mightBeSensitive\": hasattr(status, 'possibly_sensitive') and status.possibly_sensitive,\n", | |
" \"containsMedia\": 'media' in status.entities,\n", | |
" \"hashtags\": [h['text'] for h in status.entities['hashtags']]\n", | |
" }\n", | |
" if (time.time() - self.start_time) < self.limit:\n", | |
" db.Tweets.insert_one(tweet)\n", | |
" return True\n", | |
" else:\n", | |
" actual_amount = db.Tweets.count_documents({})\n", | |
" print(f'Added {actual_amount - self.start_amount} tweets')\n", | |
" return False\n", | |
" \n", | |
" def on_error(self, status):\n", | |
" print(f'Error: {status}')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"HASHTAG STREAM\n", | |
"Added 3239 tweets\n", | |
"-------------------\n", | |
"KEYWORD STREAM\n", | |
"Added 11414 tweets\n", | |
"-------------------\n", | |
"LOCATION STREAM\n", | |
"Added 12556 tweets\n", | |
"-------------------\n" | |
] | |
} | |
], | |
"source": [ | |
"listener = StreamListener(time_limit=300) # Stream for 5 min.\n", | |
"stream = tweepy.Stream(auth = api.auth, listener=listener)\n", | |
"\n", | |
"print('HASHTAG STREAM')\n", | |
"stream.filter(track=['#Trump', '#HalfLifeAlyx', '#ParoNacional'])\n", | |
"print('-------------------')\n", | |
"\n", | |
"print('KEYWORD STREAM')\n", | |
"listener.start_time = time.time()\n", | |
"stream.filter(track=['mongoDB', 'liga mx', 'tec de monterrey', 'grammys', 'impeachment'])\n", | |
"print('-------------------')\n", | |
"\n", | |
"print('LOCATION STREAM')\n", | |
"listener.start_time = time.time()\n", | |
"stream.filter(locations=[-4.62,41.97,10.49,51.1,-87.6,24.73,-75.41,32.12, -99.30,19.21, -98.85, 19.54])\n", | |
"print('-------------------')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 1. Of all the stored tweets, how many are from “verified accounts”?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"106" | |
] | |
}, | |
"execution_count": 118, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.Tweets.count_documents({'user.isVerified': True}) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 2. How many tweets did you store?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"12556" | |
] | |
}, | |
"execution_count": 119, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.Tweets.count_documents({})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 3. How many different accounts are the tweets from?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 181, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10060" | |
] | |
}, | |
"execution_count": 181, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(db.Tweets.distinct('user.accountName'))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 4. How many of those tweets are location-tagged?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 177, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"8297" | |
] | |
}, | |
"execution_count": 177, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.Tweets.count_documents({'user.location': { '$ne': None } })" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 5. What was the most popular hashtag?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 130, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'_id': 'ParoNacional', 'uses': 1421}]" | |
] | |
}, | |
"execution_count": 130, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"list(db.Tweets.aggregate([\n", | |
" {'$unwind': '$hashtags' },\n", | |
" {'$group': { '_id': '$hashtags', 'uses': { '$sum': 1 } } },\n", | |
" {'$sort': {'uses': -1}},\n", | |
" {'$limit': 1}\n", | |
"]))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 6. What is the oldest account from all the tweets you stored?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 163, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\n", | |
" \"user\": {\n", | |
" \"accountName\": \"silas216\",\n", | |
" \"joinDate\": \"2007-01-02 23:59:35\"\n", | |
" }\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"\n", | |
"q = db.Tweets.aggregate([\n", | |
" {'$sort': {'user.joinDate': 1}},\n", | |
" {'$limit': 1},\n", | |
" {'$project': {'_id': 0, 'user.accountName': 1, 'user.joinDate': 1}}\n", | |
"])\n", | |
"l = list(q)[0]\n", | |
"l['user']['joinDate'] = str(l['user']['joinDate']) # Cast to string so date can be pretty-printed\n", | |
"print(json.dumps(l, indent=4))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 7. The most used profile background color." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 170, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'_id': 'F5F8FA', 'uses': 4162}]" | |
] | |
}, | |
"execution_count": 170, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"list(db.Tweets.aggregate([\n", | |
" {'$group': {'_id': '$user.headerColor', 'uses': { '$sum': 1 } }},\n", | |
" {'$sort': {'uses': -1}},\n", | |
" {'$limit': 1}\n", | |
"]))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 8. How many of those tweets are possibly sensitive?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 165, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"128" | |
] | |
}, | |
"execution_count": 165, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.Tweets.count_documents({'mightBeSensitive': True})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 9. Of all those accounts, how many have more than 2,000 followers?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"3290" | |
] | |
}, | |
"execution_count": 120, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.Tweets.count_documents({'user.followers': {'$gt': 2000}})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 10. What percentage of those tweets included a media file (video, photo, gif.)?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 164, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"822" | |
] | |
}, | |
"execution_count": 164, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"db.Tweets.count_documents({'containsMedia': True})" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment