Last active
January 2, 2022 18:01
-
-
Save btahir/af1c57771fc4318792689a11122e6388 to your computer and use it in GitHub Desktop.
medium-backup-script.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "medium-backup-script.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/btahir/af1c57771fc4318792689a11122e6388/medium-backup-script.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ffvp_f9fbYNV", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Backup Your Medium Articles\n", | |
"\n", | |
"This script is a quick and dirty way to backup the contents of your medium stories. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jG9TcDStrhLe", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# import stuff\n", | |
"import requests\n", | |
"import bs4\n", | |
"import os\n", | |
"import shutil\n", | |
"from PIL import Image" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fjsF_K5lronE", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#@title Make A Folder\n", | |
"FOLDER_NAME = 'medium' #@param {type:\"string\"}\n", | |
"os.mkdir(FOLDER_NAME)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "pRVDffnZro9O", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# cd into that directory\n", | |
"os.chdir(FOLDER_NAME)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BnLdQjrirpD9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#@title Enter Medium Story URL\n", | |
"MEDIUM_URL = 'https://medium.com/@btahir/using-python-to-backup-your-medium-articles-6f581a0c0d90' #@param {type:\"string\"}\n", | |
"\n", | |
"response = requests.get(MEDIUM_URL)\n", | |
"soup = bs4.BeautifulSoup(response.text,'lxml')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "CqQZrN00sJxz", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "3984c200-0053-43cf-8f51-5eba645f449e" | |
}, | |
"source": [ | |
"# get title\n", | |
"title = soup.find('h1').get_text()\n", | |
"print(title)" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Using Python to Backup your Medium Articles\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-7_fFpO8sZm1", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# get text\n", | |
"paragraphs = soup.find_all('p')\n", | |
"\n", | |
"with open('content.txt', 'w') as f:\n", | |
" f.write(title + '\\n\\n')\n", | |
" for p in paragraphs:\n", | |
" f.write( p.get_text() )\n", | |
" f.write('\\n\\n')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LMUAvCklsmD0", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 119 | |
}, | |
"outputId": "51266a5a-f3da-4e6e-a4c7-ebedf5364a75" | |
}, | |
"source": [ | |
"# get images\n", | |
"images = soup.find('body').find_all('img')\n", | |
"i = 0\n", | |
"\n", | |
"for im in images:\n", | |
" i+=1\n", | |
" url = im.get('src')\n", | |
" print('Downloading: ', url) \n", | |
" try:\n", | |
" response = requests.get(url, stream=True)\n", | |
" with open(str(i) + '.jpg', 'wb') as out_file:\n", | |
" shutil.copyfileobj(response.raw, out_file)\n", | |
" del response\n", | |
" except:\n", | |
" print('Could not download: ', url)" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Downloading: https://miro.medium.com/fit/c/96/96/1*r5v9ysUAacoGCpxgMkjLiQ.jpeg\n", | |
"Downloading: https://miro.medium.com/max/60/1*2qHXJ3-C6-RGphhlak9X5Q.jpeg?q=20\n", | |
"Downloading: None\n", | |
"Could not download: None\n", | |
"Downloading: https://miro.medium.com/max/1400/1*2qHXJ3-C6-RGphhlak9X5Q.jpeg\n", | |
"Downloading: https://miro.medium.com/fit/c/160/160/1*r5v9ysUAacoGCpxgMkjLiQ.jpeg\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Y7jaY3SJ_jkn", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# delete smaller thumbnail photos - let's say any images below 500 height + width\n", | |
"for file in os.listdir('.'):\n", | |
" if file.endswith(\".jpg\"):\n", | |
" im = Image.open(file)\n", | |
" size = im.height + im.width\n", | |
" if size < 500:\n", | |
" os.remove(file)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Z42OPGS9ukHW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"videos = soup.find_all('iframe')\n", | |
"\n", | |
"youtubeStarterURL = 'https://www.youtube.com/watch?v='\n", | |
"vlist = []\n", | |
"\n", | |
"for v in videos:\n", | |
" try:\n", | |
" src = v.get('src')\n", | |
" start = src.index('watch%3Fv%3D')\n", | |
" end = src.index('&image')\n", | |
" vlist.append(youtubeStarterURL + src[start+12:end])\n", | |
" except:\n", | |
" continue\n", | |
"\n", | |
"if len(vlist) > 0:\n", | |
" with open('video-links.txt', 'w') as f:\n", | |
" for vid in vlist:\n", | |
" f.write( str(vid) )\n", | |
" f.write('\\n\\n')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "OY5Tb6SI4aSe", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "4fe6fc49-ed1a-4b47-cd2f-7641b4074b09" | |
}, | |
"source": [ | |
"cd .." | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/content\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "J-MJqeeuvMJw", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "dfd8e1f7-7097-48c0-b415-03b5ed7f8f27" | |
}, | |
"source": [ | |
"#@title ZIP Folder\n", | |
"ZIP_FILE = 'medium-zipped' #@param {type:\"string\"}\n", | |
"shutil.make_archive(ZIP_FILE, 'zip', FOLDER_NAME)" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'/content/medium-zipped.zip'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 11 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "HwY2ISxhaoYu", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"You can download the final zipped file manually form the left or send to dropbox/google drive/ other storage place." | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment