Skip to content

Instantly share code, notes, and snippets.

@enzo-santos
Created July 5, 2020 01:25
Show Gist options
  • Save enzo-santos/5393186a1ea0cbdd0dcb0c7a03cd2d57 to your computer and use it in GitHub Desktop.
Save enzo-santos/5393186a1ea0cbdd0dcb0c7a03cd2d57 to your computer and use it in GitHub Desktop.
Simple Twitter parser (mobile version)
import bs4
import requests
from typing import Dict, List
class MobileTwitterParser:
    """
    Parses information from the main page of a Twitter account.

    This parses information based on the mobile version of the Twitter webpage
    because 1. it's fast and 2. it's simple, but it contains less information
    than the desktop version, such as the pinned tweet and the exact timestamp
    of the tweets (it only shows how long ago the tweet was made, instead of an
    HH:MM:SS format).
    """

    def __init__(self, url: str, timeout: float = 10.0):
        """
        Creates a new Parser object by downloading and parsing ``url``.

        Parameters
        ----------
        url
            The main page URL of this Twitter account. Because this parser parses
            information from the mobile version of the Twitter webpage, the domain
            name must be 'mobile.twitter.com'.
        timeout
            Seconds to wait for the server before giving up. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.
        """
        # The HTTP status is deliberately not checked: the error pages are
        # parsed too (see the 'blue' element handling in `profile_info`).
        response = requests.get(url, timeout=timeout)
        self.soup = bs4.BeautifulSoup(response.text, 'lxml')

    @classmethod
    def from_username(cls, username: str):
        """
        Creates a new Parser object from a username.

        Parameters
        ----------
        username
            The username of the account to be parsed, without the leading @.
        """
        return cls(f'https://mobile.twitter.com/{username}')

    @staticmethod
    def _optional_text(parent, tag: str, css_class: str) -> str:
        """Return the stripped text of a child element, or '' if it is absent."""
        node = parent.find(tag, {'class': css_class})
        return node.text.strip() if node is not None else ''

    @property
    def profile_info(self) -> Dict[str, object]:
        """
        Return the parsed information from this profile.

        Returns
        -------
        The parsed data. It may contain the following keys:
            - is_invalid: bool: set (to True) when the error page carries a
                canonical link, which is how a non-existent account is
                recognised. If present, this is the only key in the
                dictionary.
            - is_suspended: bool: set (to True) when the error page has no
                canonical link, which is how a suspended account is
                recognised. If present, this is the only key in the
                dictionary.
            - username: str: the Twitter handle of this account.
            - fullname: str: the user fullname of this account.
            - is_verified: bool: if this account is verified.
            - location: str: the location of this account. If this value
                is empty, the user did not set this field.
            - bio: str: the biography of this account. If this value is
                empty, the user did not set this field.
            - url: str: the URL of this account. Note that this field is
                user-defined, and it has nothing to do with the
                constructor parameter of this class. If this value is
                empty, the user did not set this field.
            - is_protected: bool: if this account is protected.
            - tweets: str: number of tweets made by this account.
            - following: str: number of users this account is following.
            - followers: str: number of users following this account.

        Raises
        ------
        AttributeError
            If the page has neither the error element nor the expected
            profile tables (username, fullname and statistics are required).
        """
        # If this element exists, something's wrong
        if self.soup.find('div', {'class': 'blue'}):
            if self.soup.find('link', {'rel': 'canonical'}):
                return {'is_invalid': True}
            return {'is_suspended': True}

        details = self.soup.find('table', {'class': 'profile-details'})
        profile_info = {
            # The first two characters of the handle text are dropped.
            'username': details.find('div', {'class': 'username'}).text.strip()[2:],
            'fullname': details.find('div', {'class': 'fullname'}).text.strip(),
            'is_verified': details.find('a', {'class': 'badge'}) is not None,
            # User-optional fields: a missing element yields '' instead of
            # crashing on `None.text`.
            'location': self._optional_text(details, 'div', 'location'),
            'bio': self._optional_text(details, 'div', 'bio'),
            'url': self._optional_text(details, 'div', 'url'),
            'is_protected': self.soup.find('div', {'class': 'protected'}) is not None,
        }

        # The statistics cells appear in this fixed order on the page.
        cells = self.soup.find('table', {'class': 'profile-stats'}).find_all('td')
        for key, cell in zip(('tweets', 'following', 'followers'), cells):
            count = cell.find('div', {'class': 'statnum'}).text.strip()
            # Drop thousands separators so the value is a plain digit string.
            profile_info[key] = count.replace(',', '')
        return profile_info

    @property
    def tweets_info(self) -> List[Dict[str, str]]:
        r"""
        Return the parsed information from the tweets of this account.

        Note that not all tweets are loaded: only those that are in the main page,
        except the pinned tweet.

        Returns
        -------
        A list with one dictionary per tweet. Each may contain the following keys:
            - id: str: this tweet id.
            - sender: str: the username of the account that made this tweet.
            - timestamp: str: the timestamp when this tweet was made. It is
                in the format '(\d) (s|m|h|d|w|y)', where the 1st group
                represents the time value and the 2nd group the time unit.
            - text: str: the content of this tweet.
            - replying_to: str: the username this tweet is replying to. If
                this tweet is replying to no one, this field does not exist.
        """
        tweets_info = []
        for tweet_soup in self.soup.find_all('table', {'class': 'tweet'}):
            # The same element carries both the tweet id and its text.
            body = tweet_soup.find('div', {'class': 'tweet-text'})
            tweet_info = {
                'id': body.get('data-id').strip(),
                # [1:] removes the leading '@' of the handle.
                'sender': tweet_soup.find('div', {'class': 'username'}).text.strip()[1:],
                'timestamp': tweet_soup.find('td', {'class': 'timestamp'}).text.strip(),
                'text': body.text.strip(),
            }
            reply_context = tweet_soup.find('div', {'class': 'tweet-reply-context'})
            if reply_context is not None:
                tweet_info['replying_to'] = reply_context.find('a').text.strip()[1:]
            tweets_info.append(tweet_info)
        return tweets_info
if __name__ == '__main__':
    # Demo: download one profile page and print what was parsed from it.
    # The class defined in this file is MobileTwitterParser; the bare name
    # `Parser` is not defined anywhere here and raised NameError.
    parser = MobileTwitterParser.from_username('realDonaldTrump')
    print(parser.profile_info)
    print(parser.tweets_info)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment