simonlindgren · January 27, 2022 09:09
diff --git a/c19images.ipynb b/c19images.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bcedc959-7660-4d48-82cd-464205a092c9",
   "metadata": {},
   "source": [
    "Required packages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87a92b7a-f334-4ff1-9812-ef745966a303",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Installs the command-line scraper that a later cell runs as `!wayback-machine-scraper`.\n",
    "!pip install wayback-machine-scraper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "id": "098a8587-86bc-44e1-9563-02e74e23089e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import urllib.request\n",
    "import os\n",
    "from PIL import Image, ImageDraw, ImageFont\n",
    "import cv2\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0cb4890b-5c27-4157-9163-c571c597d545",
   "metadata": {},
   "source": [
    "Scrape Wayback Machine snapshots of the SVT corona feed start page."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebc70bc1-2afd-4538-906b-24afbdeab233",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the scraper CLI against the SVT corona feed page, with verbose logging.\n",
    "# NOTE(review): `-a` is presumably the scraper's allow-pattern for crawled URLs --\n",
    "# confirm with `wayback-machine-scraper --help`.\n",
    "!wayback-machine-scraper -a 'https://www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset' https://www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset --verbose"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "181ee7e5-b9f3-4d19-ac1b-9916dc039f3a",
   "metadata": {},
   "source": [
    "Parse the HTML with Beautiful Soup to get the URLs of the header images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "1172ecb3-d1f9-4efa-a566-e4c700067d0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Glob pattern for the saved snapshot files of the SVT corona feed page.\n",
    "# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.\n",
    "SNAPSHOT_GLOB = '/mnt/data/simon/wayback/website/www.svt.se/website/www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset/*'\n",
    "\n",
    "# glob returns matches in arbitrary order; sort so the file list (and everything\n",
    "# derived from it) is the same on every run.\n",
    "files = sorted(glob.glob(SNAPSHOT_GLOB))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "d7c5dfce-c037-4930-afec-1cc4a386712c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect one header-image URL per snapshot, in the same order as `files`.\n",
    "img_urls = []\n",
    "\n",
    "for snapshot_path in files:\n",
    "    with open(snapshot_path, 'r') as handle:\n",
    "        soup = BeautifulSoup(handle, \"html.parser\")\n",
    "\n",
    "        # First top-picture container on the page, then the first element inside it\n",
    "        # carrying the wide, preloaded picture classes.\n",
    "        top_picture = soup.find_all(\"div\", class_=\"nyh_breaking__top-picture__image\")[0]\n",
    "        picture_tag = top_picture.find_all(class_=\"pic__img pic__img--preloaded pic__img--wide\")[0]\n",
    "\n",
    "        # The URL is the fourth-from-last piece of the tag's markup when split on double\n",
    "        # quotes (presumably the value of the tag's second-to-last attribute).\n",
    "        markup_pieces = str(picture_tag).split('\"')\n",
    "        img_urls.append(markup_pieces[-4])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "751d3aba-6049-4c23-8ddc-c654c117e8ea",
   "metadata": {},
   "source": [
    "Download the images and save each one with the date of its snapshot in the filename."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "c0a1a63d-327b-40ea-a160-42731bb38c19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair each snapshot path with the image URL extracted from it.\n",
    "df = pd.DataFrame(zip(files, img_urls))\n",
    "df.columns = ['date', 'image']\n",
    "\n",
    "# Take the file stem of the snapshot path and drop its first two and last six characters.\n",
    "# NOTE(review): presumably a YYYYMMDDhhmmss timestamp reduced to YYMMDD -- confirm.\n",
    "df['day'] = df['date'].map(lambda path: path.split('.')[-2].split('/')[-1][2:-6])\n",
    "\n",
    "# Only the image URL and the day are needed from here on.\n",
    "df = df.drop(columns=['date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "fd8201d2-6f1b-4626-84e7-829b4038c604",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch every header image and store it as <day>.png in the working directory.\n",
    "# Rows sharing a day write to the same file, so the last one processed wins.\n",
    "for day, url in df[['day', 'image']].itertuples(index=False, name=None):\n",
    "    target_name = str(day) + \".png\"\n",
    "    urllib.request.urlretrieve(url, target_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9930e38-9dd0-4d4d-8df9-b1d3d5a8cd93",
   "metadata": {},
   "source": [
    "Go through the images in chronological order and keep only the first occurrence of each new image, so that no image is repeated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "9061f8db-3825-480f-8f75-d0cb4e11a1ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All downloaded images in the working directory, ordered by file name.\n",
    "images = sorted(glob.glob('*png'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "23a8ce4c-8e2c-4210-82bc-17df74f682f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete an image when it is byte-for-byte identical to the previously kept one.\n",
    "# Comparing contents (not just file sizes) avoids deleting two different images\n",
    "# that merely happen to have the same size on disk.\n",
    "prev_bytes = None\n",
    "\n",
    "for i in images:\n",
    "    with open(i, 'rb') as fh:\n",
    "        current_bytes = fh.read()\n",
    "\n",
    "    if current_bytes == prev_bytes:\n",
    "        # Same picture as the one before it: drop the repeat.\n",
    "        os.remove(i)\n",
    "    else:\n",
    "        # First use of a new picture: keep it and compare the following files against it.\n",
    "        prev_bytes = current_bytes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ac252aa-8e53-429b-b603-7c8ecbd16072",
   "metadata": {},
   "source": [
    "Write dates on the images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "id": "e1726dd4-c6c9-4d85-b6b0-aa128f6fd094",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "images = sorted(glob.glob('*png'))\n",
    "\n",
    "# Load the font once instead of once per image.\n",
    "# NOTE(review): machine-specific font path -- adjust when running elsewhere.\n",
    "myFont = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 40)\n",
    "\n",
    "# Corners of the black box drawn behind the caption.\n",
    "w, h = 250, 70\n",
    "shape = [(10, 10), (w - 10, h - 10)]\n",
    "\n",
    "for i in images:\n",
    "    # The file stem supplies the date digits; the caption reads 20YY/MM/DD.\n",
    "    a_caption = i.split('.')[0]\n",
    "    caption = \"20\" + a_caption[0:2] + \"/\" + a_caption[2:4] + \"/\" + a_caption[4:]\n",
    "\n",
    "    with Image.open(i) as source_img:\n",
    "        # JPEG cannot store alpha or palette images; convert so save() below\n",
    "        # does not fail on such files. The source file is closed on exit.\n",
    "        img = source_img.convert(\"RGB\")\n",
    "\n",
    "    # One drawing context serves both the box and the text.\n",
    "    d1 = ImageDraw.Draw(img)\n",
    "    d1.rectangle(shape, fill =\"black\")\n",
    "    d1.text((10, 10), caption, font=myFont, fill = \"white\")\n",
    "    img.save(a_caption + \".jpg\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f25b0ba9-f26f-43ff-84db-0dff7c2516f8",
   "metadata": {},
   "source": [
    "Make a movie displaying the images chronologically."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "id": "43489b62-89ad-4253-82ab-be467f40970c",
   "metadata": {},
   "outputs": [],
   "source": [
    "images = sorted(glob.glob('*jpg'))\n",
    "\n",
    "img_array = []\n",
    "size = None  # (width, height) of the video, fixed by the first readable frame\n",
    "\n",
    "for filename in images:\n",
    "    img = cv2.imread(filename)\n",
    "    if img is None:\n",
    "        # cv2.imread returns None instead of raising when a file cannot be read.\n",
    "        print(\"Skipping unreadable image:\", filename)\n",
    "        continue\n",
    "\n",
    "    height, width = img.shape[:2]\n",
    "    if size is None:\n",
    "        size = (width, height)\n",
    "    elif (width, height) != size:\n",
    "        # The video writer is opened with a single frame size, so bring images of\n",
    "        # another size to it (cv2.resize takes the target as (width, height)).\n",
    "        img = cv2.resize(img, size)\n",
    "    img_array.append(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "id": "06e2f3a8-cd19-41c4-a7a2-7841bbb3603a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the lower-case 'mp4v' tag: the FFMPEG backend reports 'MP4V' as unsupported\n",
    "# for the mp4 container and falls back to 'mp4v' with a warning.\n",
    "out = cv2.VideoWriter('project.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 1, size)\n",
    "if not out.isOpened():\n",
    "    raise RuntimeError(\"Could not open 'project.mp4' for writing\")\n",
    "\n",
    "try:\n",
    "    # The writer runs at 1 frame per second, so each image is shown for one second.\n",
    "    for frame in img_array:\n",
    "        out.write(frame)\n",
    "finally:\n",
    "    # Always finalise the file, even if writing a frame fails.\n",
    "    out.release()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "bcedc959-7660-4d48-82cd-464205a092c9",
	"metadata": {},
	"source": [
	"Required packages."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "87a92b7a-f334-4ff1-9812-ef745966a303",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Installs the command-line scraper that a later cell runs as `!wayback-machine-scraper`.\n",
	"!pip install wayback-machine-scraper"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 195,
	"id": "098a8587-86bc-44e1-9563-02e74e23089e",
	"metadata": {},
	"outputs": [],
	"source": [
	"import glob\n",
	"from bs4 import BeautifulSoup\n",
	"import pandas as pd\n",
	"import urllib.request\n",
	"import os\n",
	"from PIL import Image, ImageDraw, ImageFont\n",
	"import cv2\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "markdown",
	"id": "0cb4890b-5c27-4157-9163-c571c597d545",
	"metadata": {},
	"source": [
	"Scrape Wayback Machine snapshots of the SVT corona feed start page."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ebc70bc1-2afd-4538-906b-24afbdeab233",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Run the scraper CLI against the SVT corona feed page, with verbose logging.\n",
	"# NOTE(review): `-a` is presumably the scraper's allow-pattern for crawled URLs --\n",
	"# confirm with `wayback-machine-scraper --help`.\n",
	"!wayback-machine-scraper -a 'https://www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset' https://www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset --verbose"
	]
	},
	{
	"cell_type": "markdown",
	"id": "181ee7e5-b9f3-4d19-ac1b-9916dc039f3a",
	"metadata": {},
	"source": [
	"Parse the HTML with Beautiful Soup to get the URLs of the header images."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"id": "1172ecb3-d1f9-4efa-a566-e4c700067d0d",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Glob pattern for the saved snapshot files of the SVT corona feed page.\n",
	"# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.\n",
	"SNAPSHOT_GLOB = '/mnt/data/simon/wayback/website/www.svt.se/website/www.svt.se/nyheter/inrikes/senaste-nytt-om-coronaviruset/*'\n",
	"\n",
	"# glob returns matches in arbitrary order; sort so the file list (and everything\n",
	"# derived from it) is the same on every run.\n",
	"files = sorted(glob.glob(SNAPSHOT_GLOB))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 76,
	"id": "d7c5dfce-c037-4930-afec-1cc4a386712c",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Collect one header-image URL per snapshot, in the same order as `files`.\n",
	"# (Block indentation restored: the collapsed one-space indents were not valid Python.)\n",
	"img_urls = []\n",
	"\n",
	"for f in files:\n",
	"    with open(f,'r') as raw:\n",
	"        soup = BeautifulSoup(raw, \"html.parser\")\n",
	"        # First top-picture container, then the first wide preloaded picture element in it;\n",
	"        # the URL is the fourth-from-last piece of its markup when split on double quotes.\n",
	"        img_url = str(soup.find_all(\"div\", class_=\"nyh_breaking__top-picture__image\")[0].find_all(class_=\"pic__img pic__img--preloaded pic__img--wide\")[0]).split('\"')[-4]\n",
	"        img_urls.append(img_url)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "751d3aba-6049-4c23-8ddc-c654c117e8ea",
	"metadata": {},
	"source": [
	"Download the images and save each one with the date of its snapshot in the filename."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 80,
	"id": "c0a1a63d-327b-40ea-a160-42731bb38c19",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Pair each snapshot path with the image URL extracted from it.\n",
	"df = pd.DataFrame(zip(files, img_urls))\n",
	"df.columns = ['date', 'image']\n",
	"\n",
	"# Take the file stem of the snapshot path and drop its first two and last six characters.\n",
	"# NOTE(review): presumably a YYYYMMDDhhmmss timestamp reduced to YYMMDD -- confirm.\n",
	"df['day'] = df['date'].map(lambda path: path.split('.')[-2].split('/')[-1][2:-6])\n",
	"\n",
	"# Only the image URL and the day are needed from here on.\n",
	"df = df.drop(columns=['date'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 105,
	"id": "fd8201d2-6f1b-4626-84e7-829b4038c604",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Fetch every header image and store it as <day>.png in the working directory.\n",
	"# (Block indentation restored: the collapsed one-space indents were not valid Python.)\n",
	"for d,i in zip(df.day, df.image):\n",
	"    filename = str(d) + \".png\"\n",
	"    urllib.request.urlretrieve(i, filename)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b9930e38-9dd0-4d4d-8df9-b1d3d5a8cd93",
	"metadata": {},
	"source": [
	"Go through the images in chronological order and keep only the first occurrence of each new image, so that no image is repeated."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 125,
	"id": "9061f8db-3825-480f-8f75-d0cb4e11a1ec",
	"metadata": {},
	"outputs": [],
	"source": [
	"# All downloaded images in the working directory, ordered by file name.\n",
	"images = sorted(glob.glob('*png'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 126,
	"id": "23a8ce4c-8e2c-4210-82bc-17df74f682f7",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Delete an image when it is byte-for-byte identical to the previously kept one.\n",
	"# Comparing contents (not just file sizes) avoids deleting two different images\n",
	"# that merely happen to have the same size on disk.\n",
	"prev_bytes = None\n",
	"\n",
	"for i in images:\n",
	"    with open(i, 'rb') as fh:\n",
	"        current_bytes = fh.read()\n",
	"\n",
	"    if current_bytes == prev_bytes:\n",
	"        # Same picture as the one before it: drop the repeat.\n",
	"        os.remove(i)\n",
	"    else:\n",
	"        # First use of a new picture: keep it and compare the following files against it.\n",
	"        prev_bytes = current_bytes"
	]
	},
	{
	"cell_type": "markdown",
	"id": "6ac252aa-8e53-429b-b603-7c8ecbd16072",
	"metadata": {},
	"source": [
	"Write dates on the images."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 193,
	"id": "e1726dd4-c6c9-4d85-b6b0-aa128f6fd094",
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"images = sorted(glob.glob('*png'))\n",
	"\n",
	"# Load the font once instead of once per image.\n",
	"# NOTE(review): machine-specific font path -- adjust when running elsewhere.\n",
	"myFont = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 40)\n",
	"\n",
	"# Corners of the black box drawn behind the caption.\n",
	"w, h = 250, 70\n",
	"shape = [(10, 10), (w - 10, h - 10)]\n",
	"\n",
	"for i in images:\n",
	"    # The file stem supplies the date digits; the caption reads 20YY/MM/DD.\n",
	"    a_caption = i.split('.')[0]\n",
	"    caption = \"20\" + a_caption[0:2] + \"/\" + a_caption[2:4] + \"/\" + a_caption[4:]\n",
	"\n",
	"    with Image.open(i) as source_img:\n",
	"        # JPEG cannot store alpha or palette images; convert so save() below\n",
	"        # does not fail on such files. The source file is closed on exit.\n",
	"        img = source_img.convert(\"RGB\")\n",
	"\n",
	"    # One drawing context serves both the box and the text.\n",
	"    d1 = ImageDraw.Draw(img)\n",
	"    d1.rectangle(shape, fill =\"black\")\n",
	"    d1.text((10, 10), caption, font=myFont, fill = \"white\")\n",
	"    img.save(a_caption + \".jpg\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "f25b0ba9-f26f-43ff-84db-0dff7c2516f8",
	"metadata": {},
	"source": [
	"Make a movie displaying the images chronologically."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 202,
	"id": "43489b62-89ad-4253-82ab-be467f40970c",
	"metadata": {},
	"outputs": [],
	"source": [
	"images = sorted(glob.glob('*jpg'))\n",
	"\n",
	"img_array = []\n",
	"size = None  # (width, height) of the video, fixed by the first readable frame\n",
	"\n",
	"for filename in images:\n",
	"    img = cv2.imread(filename)\n",
	"    if img is None:\n",
	"        # cv2.imread returns None instead of raising when a file cannot be read.\n",
	"        print(\"Skipping unreadable image:\", filename)\n",
	"        continue\n",
	"\n",
	"    height, width = img.shape[:2]\n",
	"    if size is None:\n",
	"        size = (width, height)\n",
	"    elif (width, height) != size:\n",
	"        # The video writer is opened with a single frame size, so bring images of\n",
	"        # another size to it (cv2.resize takes the target as (width, height)).\n",
	"        img = cv2.resize(img, size)\n",
	"    img_array.append(img)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 203,
	"id": "06e2f3a8-cd19-41c4-a7a2-7841bbb3603a",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Use the lower-case 'mp4v' tag: the FFMPEG backend reports 'MP4V' as unsupported\n",
	"# for the mp4 container and falls back to 'mp4v' with a warning.\n",
	"out = cv2.VideoWriter('project.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 1, size)\n",
	"if not out.isOpened():\n",
	"    raise RuntimeError(\"Could not open 'project.mp4' for writing\")\n",
	"\n",
	"try:\n",
	"    # The writer runs at 1 frame per second, so each image is shown for one second.\n",
	"    for frame in img_array:\n",
	"        out.write(frame)\n",
	"finally:\n",
	"    # Always finalise the file, even if writing a frame fails.\n",
	"    out.release()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}