Skip to content

Instantly share code, notes, and snippets.

@dan-gamble
Last active August 29, 2015 14:11
Show Gist options
  • Save dan-gamble/43e5477651ac8e8b654c to your computer and use it in GitHub Desktop.
Save dan-gamble/43e5477651ac8e8b654c to your computer and use it in GitHub Desktop.
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db.models.loading import get_model
from django.utils.text import slugify
from apps.leagues.models import League
from apps.nations.models import Nation
from datetime import datetime
import os
import requests
import shutil
import urllib.request
def get_total_pages():
    """
    Get the amount of pages defined

    Requests page 1 of the EA Sports Ultimate Team item API and reads the
    ``totalPages`` value out of the JSON response.

    :return: The total amount of pages defined in the URL
    :raises requests.RequestException: If the request fails, times out or
        comes back with an HTTP error status
    """
    print('Trying base URL')
    # Without a timeout requests will wait forever on a stalled connection
    response = requests.get(
        'https://www.easports.com/uk/fifa/ultimate-team/api/fut/item?jsonParamObject=%7B%22page%22:1%7D',
        timeout=30)
    # Fail loudly on a 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    r = response.json()
    print('Success!')
    total_pages = r['totalPages']
    print('There are {} pages to be looped over'.format(total_pages))
    return total_pages
def _download_image(url, destination, timeout=5):
    """
    Download ``url`` and store it at ``destination``

    The data is written to a ``.part`` file next to the destination first and
    only moved into place once the download finished, so an interrupted
    download never leaves a truncated image behind.

    :param url: The URL of the image to fetch
    :param destination: The path the finished file should end up at
    :param timeout: Seconds to wait on the connection before giving up
    """
    partial = destination + '.part'
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response, \
                open(partial, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        shutil.move(partial, destination)
    finally:
        # Only still present when the download or the move failed
        if os.path.exists(partial):
            os.remove(partial)


def lets_get_dem_images(model, page, page_number, total_pages, url_blacklist, url_errors):
    """
    Download the images for every item on a page

    :param model: The model class the images are for, only nation, league,
        club and player models are handled, anything else is a no-op
    :param page: The decoded JSON of one API page, its 'items' are looped over
    :param page_number: The number of the page, only used for terminal output
    :param total_pages: The amount of pages, only used for terminal output
    :param url_blacklist: List of image URLs that have been dealt with
        already, new URLs are appended to it in place
    :param url_errors: Dict of destination path -> error, failures are added
        to it in place
    :return: A tuple of (url_errors, url_blacklist), the same objects that
        were passed in
    """
    # We only want to get images for certain models
    allowed_models = ['nation', 'league', 'club', 'player']

    # We can't do anything with the model itself so we need to make it usable
    model_name = model.__name__.lower()
    if model_name not in allowed_models:
        return url_errors, url_blacklist

    # Since EA are AWESOME the players images are under the headshot object,
    # rest are fine
    is_player = model_name == 'player'
    model_reference = 'headshot' if is_player else model_name

    # Directories need to exist before files are moved
    dir_model = os.path.join(settings.BASE_DIR, 'static/img/{}'.format(model_name))
    if not os.path.exists(dir_model):
        print('Making {}'.format(dir_model))
        os.makedirs(dir_model)

    # Let's start getting the images
    for item in page['items']:
        # All but player models have the url on the same line as the id, player
        # does not, same for the ID
        if is_player:
            url_image = item[model_reference]['largeImgUrl']
            model_id = item['baseId']
        else:
            url_image = item[model_reference]['imgUrl']
            model_id = item[model_reference]['id']

        # We check the url against a list of images that have already been
        # scraped so we don't send out pointless requests to stuff we already
        # have
        if url_image in url_blacklist:
            print('Page {}/{} - {} already scraped'.format(page_number, total_pages, model_id))
            continue

        # Where we are putting the file
        item_destination = '{}/{}.png'.format(dir_model, model_id)

        # We get alot of timeout errors so we put it in a try/except
        # to pass them if this happens
        try:
            # Check the disk before hitting the network, there is no point
            # downloading a file we are going to throw away
            if os.path.exists(item_destination):
                print('Page {}/{} - {} already exists, added to URL blacklist'.format(page_number, total_pages, item_destination))
            else:
                _download_image(url_image, item_destination)
                print('Page {}/{} - Created {} image, added to URL blacklist'.format(page_number, total_pages, item_destination))

            # If we have got this far it means it's a new URL since we
            # don't want to request this again we put it in a blacklist
            url_blacklist.append(url_image)
        except urllib.error.HTTPError as error:
            # HTTPError is a subclass of URLError so it has to be caught
            # first, and the body can only be read once
            body = error.read()
            print(error.code)
            print(body)
            url_errors[item_destination] = body
        except urllib.error.URLError as error:
            print(error.reason)
            url_errors[item_destination] = error.reason
        except Exception as error:
            # Best effort: one bad image should not stop the whole scrape, the
            # error is recorded so it shows up in the final report
            print(error)
            url_errors[item_destination] = error

    return url_errors, url_blacklist
# Player model field name -> key of the same value in EA's item JSON.
# common_name and the card_att* fields need extra work so they are not in here.
_PLAYER_FIELD_MAP = (
    ('asset_id', 'baseId'),
    ('first_name', 'firstName'),
    ('last_name', 'lastName'),
    ('birth_date', 'birthdate'),
    ('overall_rating', 'rating'),
    ('potential_rating', 'potential'),
    ('height', 'height'),
    ('weight', 'weight'),
    ('position', 'position'),
    ('preferred_foot', 'foot'),
    ('skill_moves', 'skillMoves'),
    ('weak_foot', 'weakFoot'),
    ('workrate_att', 'atkWorkRate'),
    ('workrate_def', 'defWorkRate'),
    ('quality', 'quality'),
    ('color', 'color'),
    ('is_special', 'isSpecialType'),
    ('item_type', 'itemType'),
    ('acceleration', 'acceleration'),
    ('aggression', 'aggression'),
    ('agility', 'agility'),
    ('balance', 'balance'),
    ('ball_control', 'ballcontrol'),
    ('crossing', 'crossing'),
    ('curve', 'curve'),
    ('dribbling', 'dribbling'),
    ('finishing', 'finishing'),
    ('free_kick_accuracy', 'freekickaccuracy'),
    ('gk_diving', 'gkdiving'),
    ('gk_handling', 'gkhandling'),
    ('gk_kicking', 'gkkicking'),
    ('gk_positioning', 'gkpositioning'),
    ('gk_reflexes', 'gkreflexes'),
    ('heading_accuracy', 'headingaccuracy'),
    ('interceptions', 'interceptions'),
    ('jumping', 'jumping'),
    ('long_passing', 'longpassing'),
    ('long_shots', 'longshots'),
    ('marking', 'marking'),
    ('penalties', 'penalties'),
    ('positioning', 'positioning'),
    ('reactions', 'reactions'),
    ('short_passing', 'shortpassing'),
    ('shot_power', 'shotpower'),
    ('sliding_tackle', 'slidingtackle'),
    ('sprint_speed', 'sprintspeed'),
    ('standing_tackle', 'standingtackle'),
    ('stamina', 'stamina'),
    ('strength', 'strength'),
    ('vision', 'vision'),
    ('volleys', 'volleys'),
    ('traits', 'traits'),
    ('specialities', 'specialities'),
    ('player_type', 'playerType'),
    ('is_goalkeeper', 'isGK'),
)


def _basic_object_data(data):
    """
    Build the get_or_create kwargs shared by all models but 'player'

    :param data: The sub-object of an item that describes the model
    :return: Dict with asset_id, name, name_abbr and slug
    """
    return {
        'asset_id': data['id'],
        'name': data['name'],
        'name_abbr': data['abbrName'],
        'slug': slugify(data['name']),
    }


def _player_object_data(item):
    """
    Build the get_or_create kwargs for a 'player' item

    :param item: One entry of a page's 'items' list
    :return: Dict of Player field name -> value
    """
    object_data = {field: item[key] for field, key in _PLAYER_FIELD_MAP}

    # Not every player has a common name, fall back to "first last"
    object_data['common_name'] = item['commonName'] or '{} {}'.format(
        item['firstName'], item['lastName'])

    # The six card attributes come as a list, the model has one field each
    for index in range(6):
        object_data['card_att{}'.format(index + 1)] = item['attributes'][index]['value']

    return object_data


def _league_nation_id(ut_data, league_id):
    """
    Find the id of the nation a league belongs to

    For some reason EA like to have a random LegendsLeague object instead of
    putting it in with the others so we need to do 2 checks.

    :param ut_data: The decoded leagues JSON
    :param league_id: The id of the league to look up
    :return: The nation id, or None if the league is not in ut_data
    """
    league_id = int(league_id)
    nation_id = None

    for league in ut_data['Leagues']['League']:
        if league_id == int(league['LeagueId']):
            nation_id = int(league['NationId'])

    legends = ut_data['Leagues']['LegendsLeague']
    if league_id == int(legends['LeagueId']):
        nation_id = int(legends['NationId'])

    return nation_id


def _club_league_id(ut_data, team_id):
    """
    Find the id of the league a club belongs to

    Same idea as the league lookup but the teams are spread over 4 places so
    we have to do 4 checks.

    :param ut_data: The decoded teams JSON
    :param team_id: The id of the club to look up
    :return: The league id, or None if the club is not in ut_data
    """
    team_id = int(team_id)
    teams = ut_data['Teams']
    league_id = None

    for team in teams['Team']:
        if team_id == int(team['TeamId']):
            league_id = int(team['LeagueId'])

    legends = teams['LegendsTeam']
    if team_id == int(legends['TeamId']):
        league_id = int(legends['LeagueId'])

    for group in ('IcebreakerTeam', 'InternationalTeam'):
        for team in teams[group]:
            if team_id == int(team['TeamId']):
                league_id = int(team['LeagueId'])

    return league_id


def lets_get_that_data(app, model):
    """
    Scrape every page of the item API and create the objects for one model

    For each page the images are downloaded first, then every item on the
    page is turned into get_or_create kwargs. Progress and a summary (counts,
    image errors, runtime) are printed to the terminal.

    :param app: The app label handed to get_model, 'clubs', 'leagues' and
        'players' get their ForeignKey dependencies resolved
    :param model: The model name handed to get_model, also used as the key of
        the model's data inside each item ('player' reads the item itself)
    """
    new = 0
    exists = 0
    base_page = 1
    total_pages = get_total_pages()
    app_model = get_model(app, model)
    start_time = datetime.now()
    url_blacklist = []
    url_errors = {}

    # All but the Nation model has some ForeignKey dependencies so we
    # create a map for them
    dependencies = {
        'club': 'league',
        'league': 'nation',
        'player': ['club', 'league', 'nation']
    }

    # If the app is clubs or leagues we need to get some extra data to help
    # create our models
    ut_data = None
    if app in ['clubs', 'leagues']:
        # EA calls clubs teams so that explains the format if statement
        ut_response = requests.get(
            'https://fifa15.content.easports.com/fifa/fltOnlineAssets/8D941B48-51BB-4B87-960A-06A61A62EBC0/2015/fut/items/web/{}.json'.format(
                app if app == 'leagues' else 'teams'),
            timeout=30)
        ut_response.raise_for_status()
        ut_data = ut_response.json()

    # Let's loop through all the pages and get all our data, range stops 1
    # before the end so we + 1 here and keep total_pages right for the output
    for i in range(base_page, total_pages + 1):
        print('Scraping page {}'.format(i))

        try:
            # Let's get the page
            page_response = requests.get(
                'https://www.easports.com/uk/fifa/ultimate-team/api/fut/item?jsonParamObject=%7B%22page%22:{}%7D'.format(
                    i),
                timeout=30)
            page_response.raise_for_status()
            page = page_response.json()
        except (requests.RequestException, ValueError) as error:
            # Without a page there is nothing to do, carrying on would re-use
            # the previous page (or blow up on the first one)
            print(error)
            continue

        print('Got page {}'.format(i))

        # url_blacklist and url_errors are updated in place, the return value
        # is those same objects so there is nothing to merge back in
        lets_get_dem_images(app_model, page, i, total_pages, url_blacklist, url_errors)

        for item in page['items']:
            # We pass this dictionary to the get_or_create as there is some
            # variable data, fresh per item so nothing leaks between items
            if model != 'player':
                # All models but the 'player' model share the same base data
                data = item[model]
                object_data = _basic_object_data(data)
            else:
                # The 'player' model has alot more and different data so we
                # seperate this out
                object_data = _player_object_data(item)

            dependency = None

            if app == 'leagues':
                # A 'league' needs a Nation
                nation_id = _league_nation_id(ut_data, data['id'])
                if nation_id is None:
                    print('No nation found for league {}, skipping'.format(data['id']))
                    continue

                # Lets get the model object to pass into the object_data
                dependency = Nation.objects.get(asset_id=nation_id)

            if app == 'clubs':
                # A 'club' needs a League
                league_id = _club_league_id(ut_data, data['id'])
                if league_id is None:
                    print('No league found for club {}, skipping'.format(data['id']))
                    continue

                # Lets get the model object to pass into the object_data
                dependency = League.objects.get(asset_id=league_id)

            # Let's pass the model dependencies through to the object_data
            if app in ['clubs', 'leagues', 'players']:
                # Since the 'players' app has 3 dependencies we need to loop
                if app == 'players':
                    for player_dependency in dependencies['player']:
                        # Apps are just a plural of the model name so we just add 's'
                        player_model = get_model(player_dependency + 's', player_dependency)

                        # Add the dependency to the object_data dict
                        object_data[player_dependency] = player_model.objects.get(
                            asset_id=item[player_dependency]['id']
                        )
                else:
                    # Simple here, other apps have 1 dependency. Add it.
                    object_data[dependencies[model]] = dependency

            try:
                # Lets create or get the object based on our object_data dict
                obj, created = app_model.objects.get_or_create(
                    **object_data
                )

                # Just give some terminal output to show progress
                # TODO: These need fixing just return 0 / 24 atm..
                if created:
                    new += 1
                    print('Page {}/{} - Created {}'.format(i, total_pages, obj))
                else:
                    print('Page {}/{} - {} exists'.format(i, total_pages, obj))
                    exists += 1
            # TODO: Not sure what this actually does, need to check.
            except ObjectDoesNotExist as error:
                print(error)
                # NOTE(review): the indentation of this file was lost, the
                # break is assumed to belong to this handler (it gives up on
                # the rest of the page) - confirm against the original
                break

    # Let's help the terminal
    print('There were {} {} created'.format(new, app))
    print('There were {} {} existing'.format(exists, app))
    print(url_errors)
    print('Total runtime was: {}'.format(datetime.now() - start_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment