Created
July 5, 2020 01:25
-
-
Save enzo-santos/5393186a1ea0cbdd0dcb0c7a03cd2d57 to your computer and use it in GitHub Desktop.
Simple Twitter parser (mobile version)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 | |
import requests | |
from typing import Dict, List | |
class MobileTwitterParser:
    """
    Parses information from the main page of a Twitter account.

    This parses information based on the mobile version of the Twitter webpage
    because 1. it's fast and 2. it's simple, but it contains less information
    than the desktop version, such as the pinned tweet and the exact timestamp
    of the tweets (it only shows how long ago the tweet was made, instead of an
    HH:MM:SS format).
    """

    def __init__(self, url: str, timeout: float = 10.0):
        """
        Creates a new Parser object by downloading and parsing `url`.

        Parameters
        ----------
        url
            The main page URL of this Twitter account. Because this parser parses
            information from the mobile version of the Twitter webpage, the domain
            name must be 'mobile.twitter.com'.
        timeout
            Seconds to wait for the server before giving up. Without a timeout
            `requests` may block indefinitely on a stalled connection.
        """
        # NOTE: the HTTP status is intentionally not checked here;
        # `profile_info` inspects the returned markup itself to detect
        # invalid or suspended accounts.
        response = requests.get(url, timeout=timeout)
        self.soup = bs4.BeautifulSoup(response.text, 'lxml')

    @classmethod
    def from_username(cls, username: str, timeout: float = 10.0):
        """
        Creates a new Parser object from an username.

        Parameters
        ----------
        username
            The username of the account to be parsed, without the leading @.
        timeout
            Seconds to wait for the server, forwarded to the constructor.
        """
        return cls(f'https://mobile.twitter.com/{username}', timeout=timeout)

    @staticmethod
    def _text_of(parent, tag: str, css_class: str) -> str:
        """Return the stripped text of the first matching child, or '' if absent."""
        node = parent.find(tag, {'class': css_class})
        return node.text.strip() if node is not None else ''

    @property
    def profile_info(self) -> Dict[str, object]:
        """
        Return the parsed information from this profile.

        Returns
        -------
        The parsed data. It may contain the following keys:
            - is_invalid: bool: if this account does not exist. If this
                value is True, this is the only key in the dictionary.
            - is_suspended: bool: if this account is suspended. If this
                value is True, this is the only key in the dictionary.
            - username: str: the Twitter handle of this account.
            - fullname: str: the user fullname of this account.
            - is_verified: bool: if this account is verified.
            - location: str: the location of this account. If this value
                is empty, the user did not set this field.
            - bio: str: the biography text of this account. If this value
                is empty, the user did not set this field.
            - url: str: the URL of this account. Note that this field is
                user-defined, and it has nothing to do with the
                constructor parameter of this class. If this value is
                empty, the user did not set this field.
            - is_protected: bool: if this account is protected.
            - tweets: str: number of tweets made by this account.
            - following: str: number of users this account is following.
            - followers: str: number of users following this account.

        Raises
        ------
        AttributeError
            If the page is not an error page but has no 'profile-details' table.
        """
        # If this element exists, something's wrong: the page is an error page
        # rather than a profile.
        if self.soup.find('div', {'class': 'blue'}) is not None:
            # The presence of a canonical <link> is what distinguishes a
            # non-existent account from a suspended one.
            if self.soup.find('link', {'rel': 'canonical'}) is not None:
                return {'is_invalid': True}
            return {'is_suspended': True}

        details = self.soup.find('table', {'class': 'profile-details'})
        info: Dict[str, object] = {
            # Two leading characters are dropped here (only one is dropped for
            # tweet senders); presumably the profile markup places an extra
            # character before the handle -- TODO confirm against live markup.
            'username': self._text_of(details, 'div', 'username')[2:],
            'fullname': self._text_of(details, 'div', 'fullname'),
            'is_verified': details.find('a', {'class': 'badge'}) is not None,
            # Optional fields: a missing element is reported as an empty string
            # instead of crashing with AttributeError.
            'location': self._text_of(details, 'div', 'location'),
            'bio': self._text_of(details, 'div', 'bio'),
            'url': self._text_of(details, 'div', 'url'),
            'is_protected': self.soup.find('div', {'class': 'protected'}) is not None,
        }

        stats = self.soup.find('table', {'class': 'profile-stats'})
        cells = stats.find_all('td') if stats is not None else []
        # zip() stops at the shorter sequence, so a table with fewer than three
        # cells simply yields fewer keys instead of raising IndexError.
        for key, cell in zip(('tweets', 'following', 'followers'), cells):
            # Thousands separators are removed; the count stays a string.
            info[key] = self._text_of(cell, 'div', 'statnum').replace(',', '')
        return info

    @property
    def tweets_info(self) -> List[Dict[str, str]]:
        r"""
        Return the parsed information from the tweets of this account.

        Note that not all tweets are loaded: only those that are in the main page,
        except the pinned tweet.

        Returns
        -------
        The parsed data, one dictionary per tweet. It may contain the following keys:
            - id: str: this tweet id.
            - sender: str: the username of the account that made this tweet.
            - timestamp: str: the timestamp when this tweet was made. It is
                in the format '(\d) (s|m|h|d|w|y)', where the 1st group
                represents the time value and the 2nd group the time unit.
            - text: str: the content of this tweet.
            - replying_to: str: the username this tweet is replying to. If
                this tweet is replying to no one, this field does not exist.
        """
        tweets = []
        for tweet in self.soup.find_all('table', {'class': 'tweet'}):
            # The same element carries both the tweet id (as an attribute) and
            # the tweet text, so look it up only once.
            body = tweet.find('div', {'class': 'tweet-text'})
            tweet_id = body.get('data-id') if body is not None else None
            info = {
                'id': (tweet_id or '').strip(),
                # [1:] drops the leading '@' of the handle.
                'sender': self._text_of(tweet, 'div', 'username')[1:],
                'timestamp': self._text_of(tweet, 'td', 'timestamp'),
                'text': body.text.strip() if body is not None else '',
            }

            context = tweet.find('div', {'class': 'tweet-reply-context'})
            link = context.find('a') if context is not None else None
            if link is not None:
                info['replying_to'] = link.text.strip()[1:]

            tweets.append(info)
        return tweets
if __name__ == '__main__':
    # Demo run: fetch one public profile and print what was parsed.
    # The class defined in this file is MobileTwitterParser; the bare name
    # `Parser` is not defined anywhere and raised NameError.
    parser = MobileTwitterParser.from_username('realDonaldTrump')
    print(parser.profile_info)
    print(parser.tweets_info)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment