Created
May 13, 2016 17:46
-
-
Save nicktimko/2b7e21c196d9e74c69fbd7f5b79014cb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# SE Chat Scraper\n", | |
"Because http://meta.stackexchange.com/questions/129374/obtaining-full-chat-transcripts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"import requests\n", | |
"\n", | |
"MAX_MSG_COUNT = 500" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_messages(room_id, before=None, n=MAX_MSG_COUNT, session=None):\n", | |
" \"\"\"SE Chat API endpoint\"\"\"\n", | |
" url = 'http://chat.stackexchange.com/chats/{}/events'.format(room_id)\n", | |
" params = {\n", | |
" 'mode': 'Messages',\n", | |
" 'msgCount': n,\n", | |
" }\n", | |
" if before is not None:\n", | |
" params['before'] = before\n", | |
" if session is None:\n", | |
" session = requests.session()\n", | |
" \n", | |
" messages = session.post(url, params).json()['events']\n", | |
" earliest = min(msg['message_id'] for msg in messages)\n", | |
" done = len(messages) != n\n", | |
" \n", | |
" return messages, earliest, done\n", | |
"\n", | |
"\n", | |
"def get_all_messages(room_id, throttle=1):\n", | |
" \"\"\"\n", | |
" Get all messages from the provided room ID. Throttle controls the inter-\n", | |
" request delay to avoid being too obnoxious.\n", | |
" \"\"\"\n", | |
" messages = []\n", | |
"\n", | |
" s = requests.session()\n", | |
"\n", | |
" new_messages, earliest, done = get_messages(room_id, session=s)\n", | |
" messages.extend(new_messages)\n", | |
" while not done:\n", | |
" time.sleep(throttle)\n", | |
" new_messages, earliest, done = get_messages(room_id, before=earliest, session=s)\n", | |
" messages.extend(new_messages)\n", | |
" \n", | |
" return messages" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# room_id = 27364 # esolang/PCG room\n", | |
"room_id = 25038 # smaller JS\n", | |
"\n", | |
"messages = get_all_messages(room_id)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'content': 'Ok, legal!',\n", | |
" 'event_type': 1,\n", | |
" 'message_id': 22529004,\n", | |
" 'room_id': 25038,\n", | |
" 'time_stamp': 1435857785,\n", | |
" 'user_id': 144907,\n", | |
" 'user_name': 'gustavox'}" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"messages[-1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import collections" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"most_active_users = collections.Counter(msg['user_name'] for msg in messages)\n", | |
"most_active_users = sorted(\n", | |
" (u[::-1] for u in most_active_users.items() if u[1] > 10), \n", | |
" reverse=True,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(296, 'KaduAmaral'),\n", | |
" (210, 'Sergio'),\n", | |
" (143, 'Luis'),\n", | |
" (89, 'gustavox'),\n", | |
" (88, 'Renan Rodrigues'),\n", | |
" (74, 'Maicon Carraro'),\n", | |
" (48, 'Xeoon'),\n", | |
" (37, 'Joao Vitor Farias Scheuermann'),\n", | |
" (24, 'Ricardo Henrique'),\n", | |
" (19, 'Ricardo'),\n", | |
" (16, 'stringnome'),\n", | |
" (16, 'Alexandre C. Caus'),\n", | |
" (15, 'Mayla Campos'),\n", | |
" (14, 'ricardo'),\n", | |
" (14, 'ctgPi'),\n", | |
" (13, 'Julio Santos'),\n", | |
" (12, 'Marciano.Andrade')]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"most_active_users # some people changed name, but have same user_id...but not as pretty to display :P" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
How can I get all of the messages, not just 500?
I think you'd have to look at the time of the last message, then set `before` to that.
How can I get all of the messages, not just 500?
I think you'd have to look at the time of the last message, then set `before` to that.
@Anonymous941 Do you mean the time of the most recent message?
@Anonymous941 Do you mean the time of the most recent message?
@Tech-Expert-Wizard I mean set `before` to the time of the last message (the 500th message) to get the next 500 messages.
How can I get all of the messages, not just 500?
Use `get_all_messages`; it repeatedly calls `get_messages` and sets `before` to before whatever the last 500 were.
eu estou nessa lista 🤔
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How can I get all of the messages, not just 500?