Skip to content

Instantly share code, notes, and snippets.

@btahir
Last active January 2, 2022 18:01
Show Gist options
  • Save btahir/af1c57771fc4318792689a11122e6388 to your computer and use it in GitHub Desktop.
Save btahir/af1c57771fc4318792689a11122e6388 to your computer and use it in GitHub Desktop.
medium-backup-script.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "medium-backup-script.ipynb",
"version": "0.3.2",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/btahir/af1c57771fc4318792689a11122e6388/medium-backup-script.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ffvp_f9fbYNV",
"colab_type": "text"
},
"source": [
"# Backup Your Medium Articles\n",
"\n",
"This script is a quick and dirty way to back up the contents of your Medium stories. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "jG9TcDStrhLe",
"colab_type": "code",
"colab": {}
},
"source": [
"# import stuff\n",
"import requests\n",
"import bs4\n",
"import os\n",
"import shutil\n",
"from PIL import Image"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fjsF_K5lronE",
"colab_type": "code",
"colab": {}
},
"source": [
"#@title Make A Folder\n",
"FOLDER_NAME = 'medium' #@param {type:\"string\"}\n",
"# exist_ok=True keeps this cell re-runnable when the folder is already there\n",
"os.makedirs(FOLDER_NAME, exist_ok=True)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pRVDffnZro9O",
"colab_type": "code",
"colab": {}
},
"source": [
"# cd into that directory (skipped when already inside it, so re-running this cell is safe)\n",
"if os.path.basename(os.getcwd()) != FOLDER_NAME:\n",
"    os.chdir(FOLDER_NAME)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "BnLdQjrirpD9",
"colab_type": "code",
"colab": {}
},
"source": [
"#@title Enter Medium Story URL\n",
"MEDIUM_URL = 'https://medium.com/@btahir/using-python-to-backup-your-medium-articles-6f581a0c0d90' #@param {type:\"string\"}\n",
"\n",
"# a timeout stops the cell from hanging forever if the server never answers\n",
"response = requests.get(MEDIUM_URL, timeout=30)\n",
"# fail loudly on 4xx/5xx instead of parsing an error page as if it were the story\n",
"response.raise_for_status()\n",
"soup = bs4.BeautifulSoup(response.text, 'lxml')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CqQZrN00sJxz",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "3984c200-0053-43cf-8f51-5eba645f449e"
},
"source": [
"# get title\n",
"heading = soup.find('h1')\n",
"if heading is None:\n",
"    # soup.find returns None when nothing matches; report that instead of an AttributeError\n",
"    raise ValueError('No <h1> found at ' + MEDIUM_URL + ' - is this a Medium story page?')\n",
"title = heading.get_text()\n",
"print(title)"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"Using Python to Backup your Medium Articles\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "-7_fFpO8sZm1",
"colab_type": "code",
"colab": {}
},
"source": [
"# get text\n",
"paragraphs = soup.find_all('p')\n",
"\n",
"# explicit UTF-8 so non-ASCII characters in the story don't depend on the platform's default encoding\n",
"with open('content.txt', 'w', encoding='utf-8') as f:\n",
"    f.write(title + '\\n\\n')\n",
"    # one paragraph per block, separated by a blank line\n",
"    for p in paragraphs:\n",
"        f.write(p.get_text())\n",
"        f.write('\\n\\n')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "LMUAvCklsmD0",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"outputId": "51266a5a-f3da-4e6e-a4c7-ebedf5364a75"
},
"source": [
"# get images\n",
"images = soup.find('body').find_all('img')\n",
"\n",
"# files are named by the <img> tag's position on the page (1.jpg, 2.jpg, ...);\n",
"# skipped tags still consume a number so names stay tied to page order\n",
"for i, im in enumerate(images, start=1):\n",
"    url = im.get('src')\n",
"    print('Downloading: ', url)\n",
"    if not url:\n",
"        # this <img> has no src attribute, so there is nothing to fetch\n",
"        print('Could not download: ', url)\n",
"        continue\n",
"    try:\n",
"        # separate name so the page-level `response` is not overwritten\n",
"        image_response = requests.get(url, timeout=30)\n",
"        # don't save an HTTP error page under a .jpg name\n",
"        image_response.raise_for_status()\n",
"    except requests.RequestException:\n",
"        # covers connection errors, timeouts, bad status codes and malformed URLs\n",
"        print('Could not download: ', url)\n",
"        continue\n",
"    # .content is the fully decoded body (unlike copying response.raw as-is)\n",
"    with open(str(i) + '.jpg', 'wb') as out_file:\n",
"        out_file.write(image_response.content)"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"Downloading: https://miro.medium.com/fit/c/96/96/1*r5v9ysUAacoGCpxgMkjLiQ.jpeg\n",
"Downloading: https://miro.medium.com/max/60/1*2qHXJ3-C6-RGphhlak9X5Q.jpeg?q=20\n",
"Downloading: None\n",
"Could not download: None\n",
"Downloading: https://miro.medium.com/max/1400/1*2qHXJ3-C6-RGphhlak9X5Q.jpeg\n",
"Downloading: https://miro.medium.com/fit/c/160/160/1*r5v9ysUAacoGCpxgMkjLiQ.jpeg\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Y7jaY3SJ_jkn",
"colab_type": "code",
"colab": {}
},
"source": [
"# delete smaller thumbnail photos - let's say any images below 500 height + width\n",
"MIN_IMAGE_SIZE = 500  # threshold compared against height + width\n",
"\n",
"for file in os.listdir('.'):\n",
"    if not file.endswith(\".jpg\"):\n",
"        continue\n",
"    try:\n",
"        # the context manager closes the file handle before we try to delete the file\n",
"        with Image.open(file) as im:\n",
"            size = im.height + im.width\n",
"    except OSError:\n",
"        # Pillow could not read this file as an image; leave it in place and move on\n",
"        print('Could not read image: ', file)\n",
"        continue\n",
"    if size < MIN_IMAGE_SIZE:\n",
"        os.remove(file)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Z42OPGS9ukHW",
"colab_type": "code",
"colab": {}
},
"source": [
"# collect YouTube links from embedded iframes and save them to video-links.txt\n",
"videos = soup.find_all('iframe')\n",
"\n",
"youtubeStarterURL = 'https://www.youtube.com/watch?v='\n",
"# URL-encoded form of 'watch?v=' that precedes the video id inside the iframe src\n",
"videoMarker = 'watch%3Fv%3D'\n",
"vlist = []\n",
"\n",
"for v in videos:\n",
"    src = v.get('src')\n",
"    if not src:\n",
"        # iframe without a src attribute: nothing to parse\n",
"        continue\n",
"    start = src.find(videoMarker)\n",
"    # only look for the terminator after the marker so the slice can't run backwards\n",
"    end = src.find('&image', start)\n",
"    if start == -1 or end == -1:\n",
"        # src does not match the expected embed pattern; skip it\n",
"        continue\n",
"    vlist.append(youtubeStarterURL + src[start + len(videoMarker):end])\n",
"\n",
"# only create the file when at least one link was found\n",
"if vlist:\n",
"    with open('video-links.txt', 'w', encoding='utf-8') as f:\n",
"        for vid in vlist:\n",
"            f.write(str(vid))\n",
"            f.write('\\n\\n')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OY5Tb6SI4aSe",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "4fe6fc49-ed1a-4b47-cd2f-7641b4074b09"
},
"source": [
"# step back up to the parent directory; the zip step below refers to FOLDER_NAME by relative path\n",
"%cd .."
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"/content\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "J-MJqeeuvMJw",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "dfd8e1f7-7097-48c0-b415-03b5ed7f8f27"
},
"source": [
"#@title ZIP Folder\n",
"ZIP_FILE = 'medium-zipped' #@param {type:\"string\"}\n",
"# pack everything under FOLDER_NAME into <ZIP_FILE>.zip; the resulting archive path is the cell output\n",
"shutil.make_archive(base_name=ZIP_FILE, format='zip', root_dir=FOLDER_NAME)"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'/content/medium-zipped.zip'"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HwY2ISxhaoYu",
"colab_type": "text"
},
"source": [
"You can download the final zipped file manually from the file panel on the left, or send it to Dropbox, Google Drive, or another storage location."
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment