Last active
August 29, 2015 14:11
-
-
Save dan-gamble/43e5477651ac8e8b654c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import shutil
import urllib.error
import urllib.request
from datetime import datetime

import requests
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db.models.loading import get_model
from django.utils.text import slugify

from apps.leagues.models import League
from apps.nations.models import Nation
def get_total_pages():
    """
    Get the amount of pages defined

    Requests page 1 of the item API and reads the ``totalPages`` value out
    of the JSON response.

    :return: The total amount of pages defined in the URL
    :raises requests.RequestException: If the request fails, times out or
        comes back with an HTTP error status
    """
    print('Trying base URL')

    response = requests.get(
        'https://www.easports.com/uk/fifa/ultimate-team/api/fut/item?jsonParamObject=%7B%22page%22:1%7D',
        # requests never times out by default, a stalled connection would
        # hang the whole scrape
        timeout=30)
    # Fail with a clear HTTP error rather than trying to decode an error page
    # as JSON
    response.raise_for_status()
    payload = response.json()

    print('Success!')

    total_pages = payload['totalPages']

    print('There are {} pages to be looped over'.format(total_pages))

    return total_pages
def lets_get_dem_images(model, page, page_number, total_pages, url_blacklist, url_errors):
    """
    Download the image of every item on a page into the static directory

    Images are written to ``<BASE_DIR>/static/img/<model name>/<id>.png``.
    Models other than nation, league, club and player are ignored.

    :param model: The model class images are fetched for, only its lower
        cased ``__name__`` is used
    :param page: Decoded JSON of one API page, must contain an ``items`` list
    :param page_number: Current page number, only used in the progress output
    :param total_pages: Total number of pages, only used in the progress output
    :param url_blacklist: List of image URLs that have already been handled,
        appended to in place
    :param url_errors: Dict of download failures keyed by destination path,
        updated in place
    :return: Tuple of the same ``url_errors`` and ``url_blacklist`` objects
        that were passed in
    """
    # We only want to get images for certain models
    allowed_models = ('nation', 'league', 'club', 'player')

    # We can't do anything with the model itself so we need to make it usable
    model_name = model.__name__.lower()

    if model_name not in allowed_models:
        return url_errors, url_blacklist

    # Since EA are AWESOME the players images are under the headshot object,
    # rest are fine
    is_player = model_name == 'player'
    model_reference = 'headshot' if is_player else model_name

    # Directories need to exist before files are written
    dir_model = os.path.join(settings.BASE_DIR, 'static/img/{}'.format(model_name))

    if not os.path.exists(dir_model):
        print('Making {}'.format(dir_model))
        os.makedirs(dir_model)

    # Let's start getting the images
    for item in page['items']:
        # All but player models have the url on the same object as the id,
        # player does not, same for the ID
        if is_player:
            url_image = item[model_reference]['largeImgUrl']
            model_id = item['baseId']
        else:
            url_image = item[model_reference]['imgUrl']
            model_id = item[model_reference]['id']

        # We check the url against a list of images that have already been
        # scraped so we don't send out pointless requests to stuff we already
        # have
        if url_image in url_blacklist:
            print('Page {}/{} - {} already scraped'.format(page_number, total_pages, model_id))
            continue

        # Where we are putting the file
        item_destination = '{}/{}.png'.format(dir_model, model_id)

        # Check the disk before the network, there is no point downloading
        # an image only to throw it away
        if os.path.exists(item_destination):
            print('Page {}/{} - {} already exists, added to URL blacklist'.format(page_number, total_pages, item_destination))
            url_blacklist.append(url_image)
            continue

        # Download to a scratch file next to the destination so an
        # interrupted transfer never leaves a broken .png behind
        partial_path = item_destination + '.part'

        # We get alot of timeout errors so we put it in a try/except
        # to record them and carry on if this happens
        try:
            with urllib.request.urlopen(url_image, timeout=5) as response, \
                    open(partial_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)

            shutil.move(partial_path, item_destination)
        except urllib.error.HTTPError as error:
            # HTTPError is a subclass of URLError so it has to be caught
            # first, the body can only be read once so keep hold of it
            body = error.read()

            print(error.code)
            print(body)

            url_errors[item_destination] = body
        except urllib.error.URLError as error:
            print(error.reason)

            url_errors[item_destination] = error.reason
        except Exception as error:
            url_errors[item_destination] = error
        else:
            print('Page {}/{} - Created {} image, added to URL blacklist'.format(page_number, total_pages, item_destination))

            # If we have got this far it means it's a new URL since we
            # don't want to request this again we put it in a blacklist
            url_blacklist.append(url_image)
        finally:
            # Only still present when the download or the move failed
            if os.path.exists(partial_path):
                os.remove(partial_path)

    return url_errors, url_blacklist
# (model field, JSON key) pairs for every player field that is copied straight
# from the API item without any processing
_PLAYER_FIELD_MAP = (
    ('asset_id', 'baseId'),
    ('first_name', 'firstName'),
    ('last_name', 'lastName'),
    ('birth_date', 'birthdate'),
    ('overall_rating', 'rating'),
    ('potential_rating', 'potential'),
    ('height', 'height'),
    ('weight', 'weight'),
    ('position', 'position'),
    ('preferred_foot', 'foot'),
    ('skill_moves', 'skillMoves'),
    ('weak_foot', 'weakFoot'),
    ('workrate_att', 'atkWorkRate'),
    ('workrate_def', 'defWorkRate'),
    ('quality', 'quality'),
    ('color', 'color'),
    ('is_special', 'isSpecialType'),
    ('item_type', 'itemType'),
    ('acceleration', 'acceleration'),
    ('aggression', 'aggression'),
    ('agility', 'agility'),
    ('balance', 'balance'),
    ('ball_control', 'ballcontrol'),
    ('crossing', 'crossing'),
    ('curve', 'curve'),
    ('dribbling', 'dribbling'),
    ('finishing', 'finishing'),
    ('free_kick_accuracy', 'freekickaccuracy'),
    ('gk_diving', 'gkdiving'),
    ('gk_handling', 'gkhandling'),
    ('gk_kicking', 'gkkicking'),
    ('gk_positioning', 'gkpositioning'),
    ('gk_reflexes', 'gkreflexes'),
    ('heading_accuracy', 'headingaccuracy'),
    ('interceptions', 'interceptions'),
    ('jumping', 'jumping'),
    ('long_passing', 'longpassing'),
    ('long_shots', 'longshots'),
    ('marking', 'marking'),
    ('penalties', 'penalties'),
    ('positioning', 'positioning'),
    ('reactions', 'reactions'),
    ('short_passing', 'shortpassing'),
    ('shot_power', 'shotpower'),
    ('sliding_tackle', 'slidingtackle'),
    ('sprint_speed', 'sprintspeed'),
    ('standing_tackle', 'standingtackle'),
    ('stamina', 'stamina'),
    ('strength', 'strength'),
    ('vision', 'vision'),
    ('volleys', 'volleys'),
    ('traits', 'traits'),
    ('specialities', 'specialities'),
    ('player_type', 'playerType'),
    ('is_goalkeeper', 'isGK'),
)


def _base_object_data(data):
    """Build the fields every model but 'player' shares from its JSON object."""
    return {
        'asset_id': data['id'],
        'name': data['name'],
        'name_abbr': data['abbrName'],
        'slug': slugify(data['name']),
    }


def _player_object_data(item):
    """Build the 'player' model fields from one API item."""
    object_data = {field: item[key] for field, key in _PLAYER_FIELD_MAP}

    # Fall back to "first last" when there is no common name
    object_data['common_name'] = item['commonName'] or '{} {}'.format(
        item['firstName'], item['lastName'])

    # The six card attributes are read by position from the attributes list
    for position in range(6):
        object_data['card_att{}'.format(position + 1)] = item['attributes'][position]['value']

    return object_data


def _find_nation_id(ut_data, league_asset_id):
    """Return the NationId the extra data lists for a league, None if absent."""
    league_asset_id = int(league_asset_id)
    leagues = ut_data['Leagues']
    nation_id = None

    for thing in leagues['League']:
        if league_asset_id == int(thing['LeagueId']):
            nation_id = thing['NationId']

    # For some reason EA like to have a random LegendsLeague object instead
    # of putting it in with the others so it needs its own check
    legends = leagues['LegendsLeague']

    if league_asset_id == int(legends['LeagueId']):
        nation_id = int(legends['NationId'])

    return nation_id


def _find_league_id(ut_data, team_asset_id):
    """Return the LeagueId the extra data lists for a team, None if absent."""
    team_asset_id = int(team_asset_id)
    teams = ut_data['Teams']
    league_id = None

    for thing in teams['Team']:
        if team_asset_id == int(thing['TeamId']):
            league_id = int(thing['LeagueId'])

    # LegendsTeam is a single object rather than a list
    legends = teams['LegendsTeam']

    if team_asset_id == int(legends['TeamId']):
        league_id = int(legends['LeagueId'])

    # Later groups are checked after the earlier ones, the last match wins
    for group in ('IcebreakerTeam', 'InternationalTeam'):
        for thing in teams[group]:
            if team_asset_id == int(thing['TeamId']):
                league_id = int(thing['LeagueId'])

    return league_id


def lets_get_that_data(app, model):
    """
    Scrape every page of the item API and create the objects for one model

    For each page the images are downloaded through ``lets_get_dem_images``
    and then every item is turned into a ``get_or_create`` call on the model.
    Progress and a final summary are printed to the terminal.

    :param app: The app label passed to ``get_model``, 'clubs', 'leagues' and
        'players' additionally get their ForeignKey dependencies resolved
    :param model: The model name passed to ``get_model``, for every model but
        'player' it is also the key of the object read from each item
    :return: None
    """
    new = 0
    exists = 0
    base_page = 1
    total_pages = get_total_pages()
    app_model = get_model(app, model)
    start_time = datetime.now()
    url_blacklist = []
    url_errors = {}

    # All but the Nation model has some ForeignKey dependencies so we
    # create a map for them
    dependencies = {
        'club': 'league',
        'league': 'nation',
        'player': ['club', 'league', 'nation']
    }

    # If the app is clubs or leagues we need to get some extra data to help
    # create our models
    ut_data = None

    if app in ('clubs', 'leagues'):
        # EA calls clubs teams so that explains the format if statement
        ut_data = requests.get(
            'https://fifa15.content.easports.com/fifa/fltOnlineAssets/8D941B48-51BB-4B87-960A-06A61A62EBC0/2015/fut/items/web/{}.json'.format(
                app if app == 'leagues' else 'teams'),
            timeout=30).json()

    # Apps are just a plural of the model name so we just add 's'. The models
    # never change between items so they are looked up once.
    player_models = {}

    if app == 'players':
        for player_dependency in dependencies['player']:
            player_models[player_dependency] = get_model(player_dependency + 's', player_dependency)

    # Let's loop through all the pages and get all our data, range stops 1
    # before the end so we + 1 to include the last page
    for i in range(base_page, total_pages + 1):
        print('Scraping page {}'.format(i))

        try:
            # Let's get the page
            page = requests.get(
                'https://www.easports.com/uk/fifa/ultimate-team/api/fut/item?jsonParamObject=%7B%22page%22:{}%7D'.format(
                    i),
                timeout=30).json()
        except Exception as error:
            print(error)
            # Without a page there is nothing to process, carrying on would
            # hit an undefined name or re-process the previous page
            continue

        print('Got page {}'.format(i))

        # url_blacklist and url_errors are updated in place, the returned
        # tuple holds the very same objects so there is nothing to merge back
        lets_get_dem_images(app_model, page, i, total_pages, url_blacklist, url_errors)

        for item in page['items']:
            # We pass this dictionary to the get_or_create as there is some
            # variable data, it is rebuilt for every item so nothing leaks
            # over from the previous one
            if model != 'player':
                # All models but the 'player' model share the same base data
                data = item[model]
                object_data = _base_object_data(data)
            else:
                # The 'player' model has alot more and different data so we
                # seperate this out
                object_data = _player_object_data(item)

            if app == 'leagues':
                # A 'league' needs a Nation, the extra data we got earlier
                # tells us which one
                nation_id = _find_nation_id(ut_data, data['id'])

                if nation_id is None:
                    print('Page {}/{} - No nation found for league {}'.format(i, total_pages, data['id']))
                    continue

                object_data[dependencies[model]] = Nation.objects.get(asset_id=nation_id)
            elif app == 'clubs':
                # Same thing as above, a 'club' needs a League
                league_id = _find_league_id(ut_data, data['id'])

                if league_id is None:
                    print('Page {}/{} - No league found for club {}'.format(i, total_pages, data['id']))
                    continue

                object_data[dependencies[model]] = League.objects.get(asset_id=league_id)
            elif app == 'players':
                # Since the 'players' app has 3 dependencies we need to loop
                for player_dependency, player_model in player_models.items():
                    object_data[player_dependency] = player_model.objects.get(
                        asset_id=item[player_dependency]['id']
                    )

            try:
                # Lets create or get the object based on our object_data dict
                obj, created = app_model.objects.get_or_create(
                    **object_data
                )

                # Just give some terminal output to show progress
                if created:
                    new += 1
                    print('Page {}/{} - Created {}'.format(i, total_pages, obj))
                else:
                    print('Page {}/{} - {} exists'.format(i, total_pages, obj))
                    exists += 1
            # NOTE(review): get_or_create is not expected to raise this and
            # the dependency lookups above sit outside the try - confirm
            # whether abandoning the rest of the page is really intended
            except ObjectDoesNotExist as error:
                print(error)
                break

    # Let's help the terminal
    print('There were {} {} created'.format(new, app))
    print('There were {} {} existing'.format(exists, app))
    print(url_errors)
    print('Total runtime was: {}'.format(datetime.now() - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment