Scrape tonie tracks
{
"cells": [
{
"cell_type": "markdown",
"id": "nasty-denver",
"metadata": {},
"source": [
"## Get the JSON and the first part of the tonie URLs (the series part)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "strong-louisiana",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from fuzzywuzzy import fuzz\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import requests\n",
"\n",
"url = 'http://gt-blog.de/JSON/tonies.json'\n",
"data = requests.get(url).json()\n",
"\n",
"url_base = 'https://tonies.de'\n",
"# Characters to drop or transliterate when turning a name into a URL slug\n",
"special_char_map = {ord('!'): '', ord('?'): '', ord('’'): '', ord('&'): '', ord('.'): '', ord(','): '', ord(' '): '-', ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}\n",
"series_names_url = []\n",
"episode_names = []\n",
"episode_names_url = []\n",
"series_urls = []\n",
"for tonie in data:\n",
"    if tonie['language'] == 'de':\n",
"        # Collapse the hyphen runs left over after character replacement\n",
"        series_names_url.append(re.sub('--+', '-', tonie['series'].translate(special_char_map).lower()))\n",
"        episode_names.append(tonie['episodes'])\n",
"        episode_names_url.append(re.sub('--+', '-', episode_names[-1].translate(special_char_map).lower()))\n",
"        series_urls.append('{base}/shop/tonies/{series}/'.format(base=url_base, series=series_names_url[-1]))\n",
"    # TODO: Instead of collecting series_urls, directly get track names, add them to the tonies, then save the JSON file\n",
"series_urls = np.unique(series_urls)"
]
},
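{
"cell_type": "markdown",
"id": "slug-check-md",
"metadata": {},
"source": [
"*Optional sanity check (added sketch, not part of the original pipeline):* run the same `translate` + `re.sub('--+', '-')` rule on a sample name and inspect the slug it produces. The sample titles below are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "slug-check",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch reusing special_char_map and re from the cell above;\n",
"# the sample titles are illustrative, not read from the catalogue.\n",
"for sample in ['Der Räuber Hotzenplotz', 'Heule Eule & andere Geschichten!']:\n",
"    slug = re.sub('--+', '-', sample.translate(special_char_map).lower())\n",
"    print(sample, '->', slug)  # e.g. 'der-raeuber-hotzenplotz'"
]
},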
{
"cell_type": "markdown",
"id": "metallic-greek",
"metadata": {},
"source": [
"## Check whether the first part of the URL is correct and get the listed episodes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "worldwide-sensitivity",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://tonies.de/shop/tonies/der-raeuber-hotzenplotz/\n",
"https://tonies.de/shop/tonies/die-maus/\n",
"https://tonies.de/shop/tonies/heule-eule-und-andere-geschichten/\n",
"https://tonies.de/shop/tonies/kosmo-klax/\n",
"https://tonies.de/shop/tonies/kreativ-tonie/\n",
"https://tonies.de/shop/tonies/nola-note/\n",
"https://tonies.de/shop/tonies/rotzn-roll-radio/\n"
]
}
],
"source": [
"all_episode_urls = []\n",
"for url in series_urls:\n",
"    r = requests.get(url)\n",
"    if r.status_code == 200:\n",
"        soup = BeautifulSoup(r.content, 'html.parser')\n",
"        hrefs = [a['href'] for a in soup.find_all('a', href=True)]\n",
"        # Keep only links that contain this series' path, i.e. the URL minus the 'https://tonies.de' prefix\n",
"        series_path = url[len(url_base):]\n",
"        all_episode_urls.append(np.unique([href for href in hrefs if series_path in href]))\n",
"    else:\n",
"        # The guessed series URL does not exist; print it for manual follow-up\n",
"        print(url)"
]
},
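{
"cell_type": "markdown",
"id": "href-filter-md",
"metadata": {},
"source": [
"*Illustration (added sketch):* the filter above keeps an `href` only if it contains the series path. A toy example with made-up hrefs:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "href-filter-demo",
"metadata": {},
"outputs": [],
"source": [
"# Toy illustration of the href filter; these hrefs are made up.\n",
"toy_hrefs = ['/shop/tonies/die-maus/die-maus-schlaf-und-traumlieder/',\n",
"             '/shop/zubehoer/toniebox/',\n",
"             '/shop/tonies/die-maus/']\n",
"toy_series_path = '/shop/tonies/die-maus/'\n",
"print([h for h in toy_hrefs if toy_series_path in h])"
]
},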
{
"cell_type": "markdown",
"id": "regulated-cargo",
"metadata": {},
"source": [
"## Get the track names of all episodes"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "special-batch",
"metadata": {},
"outputs": [],
"source": [
"episode_list = []\n",
"for series_array in all_episode_urls:\n",
"    for partial_url in series_array:\n",
"        full_url = url_base + partial_url\n",
"        r = requests.get(full_url)\n",
"\n",
"        soup = BeautifulSoup(r.content, 'html.parser')\n",
"        # The track list lives in the 'Titelliste' tab of the product page\n",
"        titlelist = soup.find_all(id=\"tabs--large-up__titelliste\")\n",
"        if titlelist:\n",
"            titlelist = [a.get_text() for a in titlelist[0].find_all('p')]\n",
"            # partial_url looks like '/shop/tonies/<series>/<episode>/'\n",
"            episode_list.append([full_url, partial_url.split('/')[-3], partial_url.split('/')[-2], titlelist])\n",
"df = pd.DataFrame(episode_list, columns=['full_url', 'series_url', 'episode_url', 'tracks'])"
]
},
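{
"cell_type": "markdown",
"id": "split-check-md",
"metadata": {},
"source": [
"*Quick check (added sketch):* the split indices `[-3]` and `[-2]` pick the series and episode segments out of a partial URL of the form `/shop/tonies/<series>/<episode>/`. The path below is made up."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "split-check",
"metadata": {},
"outputs": [],
"source": [
"# Made-up path to show which segments the split indices select.\n",
"p = '/shop/tonies/die-maus/die-maus-schlaf-und-traumlieder/'\n",
"print(p.split('/')[-3], '|', p.split('/')[-2])  # series | episode"
]
},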
{
"cell_type": "markdown",
"id": "successful-closure",
"metadata": {},
"source": [
"## Match each episode back to the JSON"
]
},
{
"cell_type": "markdown",
"id": "intensive-native",
"metadata": {},
"source": [
"The nesting order is important: with the loops reversed, many low-quality matches occur."
]
},
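{
"cell_type": "markdown",
"id": "fuzz-demo-md",
"metadata": {},
"source": [
"*Illustration (added sketch):* `fuzz.ratio` returns a 0-100 similarity score between two strings; the loop below keeps, per tonie, the episode with the highest score. The example strings here are made up."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fuzz-demo",
"metadata": {},
"outputs": [],
"source": [
"# Made-up pair to show the kind of score the matching loop compares.\n",
"name_from_url = 'die maus - die maus schlaf und traumlieder'\n",
"catalogue_title = 'Die Maus - Schlaf- und Traumlieder'.lower()\n",
"print(fuzz.ratio(name_from_url, catalogue_title))"
]
},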
{
"cell_type": "code",
"execution_count": 4,
"id": "mature-retreat",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n"
]
}
],
"source": [
"for index, row in df.iterrows():\n",
"    # Reconstruct '<series> - <episode>' from the URL; it does not depend on the inner loop\n",
"    full_name = ' - '.join(row['full_url'].replace('-', ' ').lower().split('/')[-3:-1])\n",
"    fuzzy_ratio = []\n",
"    for tonie in data:\n",
"        # if tonie['language'] == 'de':\n",
"        fuzzy_ratio.append(fuzz.ratio(full_name, tonie['title'].lower()))\n",
"    matching_tonie = int(np.argmax(fuzzy_ratio))\n",
"    best_ratio = max(fuzzy_ratio)\n",
"    if 'fuzzy_ratio' not in data[matching_tonie]:\n",
"        data[matching_tonie]['tracks'] = row['tracks']\n",
"        data[matching_tonie]['url'] = row['full_url']\n",
"        data[matching_tonie]['fuzzy_ratio'] = str(best_ratio)\n",
"    elif int(data[matching_tonie]['fuzzy_ratio']) < best_ratio:\n",
"        # A later episode matches this tonie better; overwrite the earlier match\n",
"        print('better match found!')\n",
"        data[matching_tonie]['tracks'] = row['tracks']\n",
"        data[matching_tonie]['url'] = row['full_url']\n",
"        data[matching_tonie]['fuzzy_ratio'] = str(best_ratio)\n",
"\n",
"with open('tonies.json', 'w') as f:\n",
"    json.dump(data, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Unfortunately, retrieving a "series" via requests.get() doesn't return the same information for me that I get in the browser. Example: https://tonies.com/de-de/tonies/?series=anne-kaffeekanne ("tonies.de" has been replaced by "tonies.com/de-de", and "tonies/${series}" has become "tonies/?series=${series}"). In the browser I get one hit, while the Python code returns some random, unrelated content :(
Any suggestions as to what might be going wrong here?
Also, I'm still trying to figure out what happens in the fourth stage; I'm getting some rather bad matches.
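For reference, here is a minimal sketch of what I'm trying against the new URL scheme (the User-Agent header and the client-side-rendering guess are just assumptions on my part):

```python
import requests

# New-style URL as described above; only the series slug comes from the old pipeline.
series = 'anne-kaffeekanne'
new_url = 'https://tonies.com/de-de/tonies/?series={series}'.format(series=series)
r = requests.get(new_url, headers={'User-Agent': 'Mozilla/5.0'})  # header is a guess
print(r.status_code)
# If the result list is rendered client-side by JavaScript, requests only sees
# the empty page shell, which would explain the unrelated content:
print(series in r.text)
```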