Created
January 27, 2022 09:09
-
-
Save simonlindgren/4e0efed4bb35f2dd6cea530ef30bb5fb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "bcedc959-7660-4d48-82cd-464205a092c9", | |
"metadata": {}, | |
"source": [ | |
"Required packages." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "87a92b7a-f334-4ff1-9812-ef745966a303", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!pip install wayback-machine-scraper" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 195, | |
"id": "098a8587-86bc-44e1-9563-02e74e23089e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import glob\n", | |
"from bs4 import BeautifulSoup\n", | |
"import pandas as pd\n", | |
"import urllib.request\n", | |
"import os\n", | |
"from PIL import Image, ImageDraw, ImageFont\n", | |
"import cv2\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "0cb4890b-5c27-4157-9163-c571c597d545", | |
"metadata": {}, | |
"source": [ | |
"Scrape wayback machine snapshots of SVT corona feed start page." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ebc70bc1-2afd-4538-906b-24afbdeab233", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!wayback-machine-scraper -a 'https://www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset' https://www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset --verbose" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "181ee7e5-b9f3-4d19-ac1b-9916dc039f3a", | |
"metadata": {}, | |
"source": [ | |
"Parse the html with Beautiful Soup to get urls of header images." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "1172ecb3-d1f9-4efa-a566-e4c700067d0d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"files = glob.glob('/mnt/data/simon/wayback/website/www.svt.se/website/www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset/*')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"id": "d7c5dfce-c037-4930-afec-1cc4a386712c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"img_urls = []\n", | |
"\n", | |
"for f in files:\n", | |
" with open(f,'r') as raw:\n", | |
" soup = BeautifulSoup(raw, \"html.parser\")\n", | |
" img_url = str(soup.find_all(\"div\", class_=\"nyh_breaking__top-picture__image\")[0].find_all(class_=\"pic__img pic__img--preloaded pic__img--wide\")[0]).split('\"')[-4]\n", | |
" img_urls.append(img_url)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "751d3aba-6049-4c23-8ddc-c654c117e8ea", | |
"metadata": {}, | |
"source": [ | |
"Download images and save them with date of publication in filename." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 80, | |
"id": "c0a1a63d-327b-40ea-a160-42731bb38c19", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(zip(files,img_urls))\n", | |
"df.columns = ['date', 'image']\n", | |
"\n", | |
"df['day'] = [i.split('.')[-2].split('/')[-1][2:-6] for i in df.date]\n", | |
"\n", | |
"df.drop(['date'], axis = 1, inplace = True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"id": "fd8201d2-6f1b-4626-84e7-829b4038c604", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for d,i in zip(df.day, df.image):\n", | |
" filename = str(d) + \".png\"\n", | |
" urllib.request.urlretrieve(i, filename) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b9930e38-9dd0-4d4d-8df9-b1d3d5a8cd93", | |
"metadata": {}, | |
"source": [ | |
"Go through images and find the first use of a new image to not have repeated images." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"id": "9061f8db-3825-480f-8f75-d0cb4e11a1ec", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"images = glob.glob('*png')\n", | |
"images.sort()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"id": "23a8ce4c-8e2c-4210-82bc-17df74f682f7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"prevsize = 9999999999999999\n", | |
"\n", | |
"for i in images:\n", | |
" size = os.path.getsize(i)\n", | |
" if size == prevsize:\n", | |
" os.remove(i)\n", | |
" continue\n", | |
" else:\n", | |
" prevsize = size \n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6ac252aa-8e53-429b-b603-7c8ecbd16072", | |
"metadata": {}, | |
"source": [ | |
"Write dates on the images." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 193, | |
"id": "e1726dd4-c6c9-4d85-b6b0-aa128f6fd094", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"images = glob.glob('*png')\n", | |
"images.sort()\n", | |
"\n", | |
"for i in images:\n", | |
" a_caption = i.split('.')[0]\n", | |
" caption = \"20\" + a_caption[0:2] + \"/\" + a_caption[2:4] + \"/\" + a_caption[4:]\n", | |
" img = Image.open(i)\n", | |
" d1 = ImageDraw.Draw(img)\n", | |
" \n", | |
" # Rectangle\n", | |
" w, h = 250, 70\n", | |
" shape = [(10, 10), (w - 10, h - 10)]\n", | |
" img1 = ImageDraw.Draw(img) \n", | |
" img1.rectangle(shape, fill =\"black\")\n", | |
" \n", | |
" # Caption on rectangle\n", | |
" myFont = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 40)\n", | |
" d1.text((10, 10), caption, font=myFont, fill = \"white\")\n", | |
" img.save(a_caption + \".jpg\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f25b0ba9-f26f-43ff-84db-0dff7c2516f8", | |
"metadata": {}, | |
"source": [ | |
"Make a movie displaying the images chronologically." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 202, | |
"id": "43489b62-89ad-4253-82ab-be467f40970c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"images = glob.glob('*jpg')\n", | |
"images.sort()\n", | |
"\n", | |
"img_array = []\n", | |
"\n", | |
"for filename in images:\n", | |
" img = cv2.imread(filename)\n", | |
" height, width, layers = img.shape\n", | |
" size = (width,height)\n", | |
" img_array.append(img)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 203, | |
"id": "06e2f3a8-cd19-41c4-a7a2-7841bbb3603a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'\n", | |
"OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'\n" | |
] | |
} | |
], | |
"source": [ | |
"out = cv2.VideoWriter('project.mp4',cv2.VideoWriter_fourcc(*'MP4V'), 1, size)\n", | |
" \n", | |
"for i in range(len(img_array)):\n", | |
" out.write(img_array[i])\n", | |
"out.release()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment