
@stephenhouser
Last active August 31, 2018 18:18
Extract tabular data from Russian website.

Scrape and Translate Russian Grant Website

A Python 3 Jupyter notebook that scrapes and then translates (Russian to English) tabular data from a Russian federal grant website. Written on request for student XXX, Bowdoin College, by Stephen Houser.

NOTE: You will have to supply your own Google API credentials file (api-project-credentials.json) for the translation to work!
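An aside on the scraping approach: the notebook shells out to `cURL` because `urllib` had trouble with the site's non-ASCII (Cyrillic) domain name. A possible pure-Python workaround, sketched here and not tested against the live site, is to punycode-encode the host with Python's built-in `idna` codec so the whole URL becomes plain ASCII:

```python
# Sketch: make the Cyrillic host ASCII-safe for clients such as urllib.
# The path and query of this site are already ASCII, so only the host
# needs encoding.
host = 'президентскиегранты.рф'
ascii_host = host.encode('idna').decode('ascii')  # punycode form

page_url = 'https://{}/public/application/table?OnlyWinners=false&page=1'.format(ascii_host)
print(page_url)
```

The resulting URL can then be passed to any HTTP client that chokes on the raw Cyrillic host.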

You can run this notebook interactively with Binder.

Jupyter Notebook to scrape and translate Russian grant website
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Original Project Specification\n",
"\n",
"Scrape (Russian) Presidential Grants for Civil Society Development Foundation site to extract proposal metadata for further research and analysis.\n",
"\n",
"Subject: Link\n",
" \n",
"https://президентскиегранты.рф/public/application/table?RegionId=\n",
"\n",
"\"...Data from the first competition of 2017 and first competition of 2018.\"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Set up the base URL and the number of pages to fetch.\n",
"# Uncomment one url/pages pair and comment out the others.\n",
"\n",
"# First 2017 competition\n",
"#url = 'https://президентскиегранты.рф/public/application/table?OnlyWinners=false&CompetitionId=1'\n",
"#pages = 309\n",
"\n",
"# First 2018 competition\n",
"#url = 'https://президентскиегранты.рф/public/application/table?OnlyWinners=false&CompetitionId=3'\n",
"#pages = 428\n",
"\n",
"# Everything (1100+ pages)\n",
"url = 'https://президентскиегранты.рф/public/application/table?OnlyWinners=false'\n",
"#pages = 1179\n",
"pages = 2  # small sample for a quick test run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Utility functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Clean up text: collapse newlines and runs of spaces, strip surrounding whitespace.\n",
"def clean_text(dirty):\n",
"    clean = dirty.replace('\\n', ' ')\n",
"    clean = re.sub(' +', ' ', clean)\n",
"    return clean.strip()\n",
"\n",
"# Normalize a number string: remove spaces (thousands separators) and\n",
"# convert the Russian decimal comma to a decimal point.\n",
"def clean_number(dirty):\n",
"    clean = re.sub(' +', '', dirty.strip())\n",
"    return clean.replace(',', '.')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Simple progress bar -- https://github.com/alexanderkuk/log-progress\n",
"# Not working on Bowdoin JupyterHub w/Python v3.6\n",
"def log_progress(sequence, every=None, size=None, name='Items'):\n",
"    from ipywidgets import IntProgress, HTML, VBox\n",
"    from IPython.display import display\n",
"\n",
"    is_iterator = False\n",
"    if size is None:\n",
"        try:\n",
"            size = len(sequence)\n",
"        except TypeError:\n",
"            is_iterator = True\n",
"    if size is not None:\n",
"        if every is None:\n",
"            if size <= 200:\n",
"                every = 1\n",
"            else:\n",
"                every = int(size / 200)  # every 0.5%\n",
"    else:\n",
"        assert every is not None, 'sequence is iterator, set every'\n",
"\n",
"    if is_iterator:\n",
"        progress = IntProgress(min=0, max=1, value=1)\n",
"        progress.bar_style = 'info'\n",
"    else:\n",
"        progress = IntProgress(min=0, max=size, value=0)\n",
"    label = HTML()\n",
"    box = VBox(children=[label, progress])\n",
"    display(box)\n",
"\n",
"    index = 0\n",
"    try:\n",
"        for index, record in enumerate(sequence, 1):\n",
"            if index == 1 or index % every == 0:\n",
"                if is_iterator:\n",
"                    label.value = '{name}: {index} / ?'.format(\n",
"                        name=name,\n",
"                        index=index\n",
"                    )\n",
"                else:\n",
"                    progress.value = index\n",
"                    label.value = u'{name}: {index} / {size}'.format(\n",
"                        name=name,\n",
"                        index=index,\n",
"                        size=size\n",
"                    )\n",
"            yield record\n",
"    except:\n",
"        progress.bar_style = 'danger'\n",
"        raise\n",
"    else:\n",
"        progress.bar_style = 'success'\n",
"        progress.value = index\n",
"        label.value = \"{name}: {index}\".format(\n",
"            name=name,\n",
"            index=str(index or '?')\n",
"        )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download data from website\n",
"\n",
"Fetches each page, one at a time, using the `cURL` system utility, then parses the HTML with the `BeautifulSoup` parser to extract the `table__cell`s of each `table__row` and stores them in an array of arrays (a data-frame-like structure).\n",
"\n",
"`cURL` was used because `urllib` had problems with the site's non-ASCII domain name. Otherwise, `urllib` would be the better choice to keep things all \"Pythonic\".\n",
"\n",
"`BeautifulSoup` was used as it provides a very easy way to load and then navigate the HTML document."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching page 1 https://президентскиегранты.рф/public/application/table?OnlyWinners=false&page=1\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e4525e85ab5d44ff86e277c501788005",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value=''), IntProgress(value=0, max=21)))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching page 2 https://президентскиегранты.рф/public/application/table?OnlyWinners=false&page=2\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "460bd566faff4b139497fabb4b6cc758",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value=''), IntProgress(value=0, max=21)))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"done.\n"
]
}
],
"source": [
"import subprocess\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# The 2D array within which we will collect all scraped tabular data\n",
"data_rows = []\n",
"\n",
"# For each page (set `pages` above to control how many are fetched)\n",
"for page in range(1, pages+1):\n",
"    # Fetch the page via a cURL external process.\n",
"    # (There were a number of problems using Python urllib, including\n",
"    # SSL certificate issues with the non-ASCII URL (actually an IRI))\n",
"    page_url = url + '&page=' + str(page)\n",
"    print(\"Fetching page\", page, page_url)\n",
"    result = subprocess.run(['curl', page_url], stdout=subprocess.PIPE)\n",
"\n",
"    # Parse the resulting HTML with BeautifulSoup into a `soup` object we can traverse\n",
"    soup = BeautifulSoup(result.stdout, 'html.parser')\n",
"\n",
"    # Find all divs with class table__row; these are the data rows\n",
"    table_rows = soup.find_all('div', attrs={'class': 'table__row'})\n",
"    for table_row in log_progress(table_rows, every=10):\n",
"        # Ignore the header row\n",
"        if 'table__row--header' in table_row.attrs['class']:\n",
"            continue\n",
"\n",
"        # Build an array (row) to collect one row/line of the tabular data\n",
"        data_row = []\n",
"\n",
"        # Within each row, find all data cells (divs) with class table__cell\n",
"        table_cells = table_row.find_all('div', attrs={'class': 'table__cell'})\n",
"        for table_cell in table_cells:\n",
"            # Here we get really specific to each cell's makeup.\n",
"            if len(table_cell.contents) == 1:\n",
"                # Cells with one child: just copy over the text\n",
"                data_row.append(clean_text(table_cell.text))\n",
"            else:\n",
"                # Cells with more than one child: copy out the row number.\n",
"                # Not entirely sure what this data is, but it's mostly numeric\n",
"                text = ''.join([x.string for x in table_cell.contents[2:5]])\n",
"\n",
"                # A little extra clean-up on this field\n",
"                text = text.replace('от', 'from')\n",
"                if text and text[-1] == '(':\n",
"                    text = text[:-2]\n",
"\n",
"                data_row.append(clean_text(text))\n",
"\n",
"                # Append the winner/not-winner competition status text\n",
"                # as parsed from the first column\n",
"                data_row.append(clean_text(table_cell.contents[5].string))\n",
"\n",
"        # Append this row to our collection of rows/data\n",
"        data_rows.append(data_row)\n",
"\n",
"print(\"done.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show a sample (the last row) of what we just fetched and parsed."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['18-1-024742 from 26.03.2018' 'Проект не получил поддержку'\n",
" 'РЕГИОНАЛЬНОЕ ОТДЕЛЕНИЕ ОБЩЕРОССИЙСКОЙ ОБЩЕСТВЕННОЙ ОРГАНИЗАЦИИ \"ФЕДЕРАЦИЯ СНОУБОРДА РОССИИ\" В ТЮМЕНСКОЙ ОБЛАСТИ Тюменская область'\n",
" 'охрана здоровья граждан, пропаганда здорового образа жизни'\n",
" 'Фестиваль экстремальных видов спорта Экстрим ЭКСПО - 2019'\n",
" '2 117 647,50']]\n"
]
}
],
"source": [
"#from pandas import DataFrame\n",
"#print(DataFrame(data_rows[0:2]))\n",
"\n",
"import numpy as np\n",
"print(np.matrix(data_rows[-1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save downloaded (Russian) data into CSV file for possible later use"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"# newline='' avoids blank lines on Windows; utf-8 keeps the Russian text intact\n",
"with open('russian.csv', 'w', newline='', encoding='utf-8') as csv_file:\n",
"    writer = csv.writer(csv_file)\n",
"    for data_row in data_rows:\n",
"        writer.writerow(data_row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Translate downloaded data rows into English\n",
"Google Translate costs money, so there are a few optimizations here to avoid re-translating common fields."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## To cut down on translations needed (and cost)\n",
"\n",
"1. Don't translate numeric-only columns (0 and 5).\n",
"2. Use a \"pretranslated\" table for fixed, limited columns (e.g. winner/not winner and subject areas).\n",
"\n",
"## Columns\n",
"\n",
"0. Proposal number (not translated)\n",
"1. Status (table-translated)\n",
"2. Title or \"Name of Company\" (translated)\n",
"3. Subject Area or \"Grant Direction\" (table-translated)\n",
"4. Project Name (translated)\n",
"5. The amount requested (not translated)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table-based local translation"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Internal hash of common / known translations\n",
"local_translations = {\n",
" 'от': 'from',\n",
" 'Не допущена до экспертизы': 'Not admitted to the examination', \n",
" 'Проект не получил поддержку': 'The project did not receive support', \n",
" 'Победитель конкурса': 'Winner of the competition', \n",
" 'социальное обслуживание, социальная поддержка и защита граждан': 'social services, social support and citizen protection', \n",
" 'охрана здоровья граждан, пропаганда здорового образа жизни': \"protection of citizens' health, promotion of a healthy lifestyle\", \n",
" 'поддержка семьи, материнства, отцовства и детства': 'support for family, motherhood, paternity and childhood', \n",
" 'поддержка молодёжных проектов, реализация которых охватывает виды деятельности, предусмотренные статьёй 31.1 федерального закона от 12 января 1996 г. № 7-фз «о некоммерческих организациях»': 'support of youth projects, the implementation of which covers the activities provided for in Article 31.1 of the federal law of January 12, 1996, No. 7-FZ \"On Non-Profit Organizations\"', \n",
" 'поддержка молодёжных проектов, реализация которых охватывает виды деятельности, предусмотренные статьёй 31.1 Федерального закона от 12 января 1996 г. № 7-ФЗ «О некоммерческих организациях»': 'support of youth projects, the implementation of which covers the types of activities provided for in Article 31.1 of the Federal Law of January 12, 1996, No. 7-FZ \"On Non-Commercial Organizations\"', \n",
" 'поддержка проектов в области науки, образования, просвещения': 'support for projects in science, education, and enlightenment', \n",
" 'поддержка проектов в области культуры и искусства': 'support for projects in the field of culture and art', \n",
" 'сохранение исторической памяти': 'saving historical memory', \n",
" 'защита прав и свобод человека и гражданина, в том числе защита прав заключённых': \"protection of human and civil rights and freedoms, including protection of prisoners' rights\", \n",
" 'охрана окружающей среды и защита животных': 'environmental protection and animal protection', \n",
" 'укрепление межнационального и межрелигиозного согласия': 'Strengthening interethnic and interreligious harmony', \n",
" 'развитие общественной дипломатии и поддержка соотечественников': 'development of public diplomacy and support of compatriots', \n",
" 'развитие институтов гражданского общества': 'development of civil society institutions', \n",
" 'выявление и поддержка молодых талантов в области культуры и искусства': 'Identifying and supporting young talents in the field of culture and art', \n",
" 'поддержка проектов в области науки, образования, просвещения - долгосрочный проект': 'support for projects in science, education, and enlightenment - long-term project', \n",
" 'выявление и поддержка молодых талантов в области культуры и искусства - долгосрочный проект': 'Identifying and supporting young talents in the field of culture and art - long-term project', \n",
" 'развитие институтов гражданского общества - долгосрочный проект': 'development of civil society institutions - long-term project'\n",
"}\n",
"\n",
"# Alternate version to load from a CSV file\n",
"#local_translations = {}\n",
"#with open('translations.csv', 'r') as trans_file:\n",
"# reader = csv.reader(trans_file)\n",
"# for row in reader:\n",
"# (russian, english) = row\n",
"# local_translations[russian] = english\n",
"\n",
"def translate_local(text):\n",
"    if text in local_translations:\n",
"        return local_translations[text]\n",
"    else:\n",
"        print(\"WARNING: Local translation not found for: [\" + text + \"]\")\n",
"        return text"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cloud-based Google Translation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#https://cloud.google.com/translate/docs/quickstart\n",
"from google.cloud import translate\n",
"from pathlib import Path\n",
"import os\n",
"import html\n",
"\n",
"# Set up API Access and translation client\n",
"# GOOGLE_APPLICATION_CREDENTIALS value is a filename of the Access token (see link above)\n",
"\n",
"os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = str(Path(Path.home(), \"api-project-credentials.json\"))\n",
"translate_client = translate.Client()\n",
"\n",
"def translate_google(text):\n",
"    result = translate_client.translate(text, target_language='en')\n",
"    return html.unescape(result['translatedText'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Perform the translation on all rows collected"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0dcbb650aed34ceab8a3291f8af17899",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value=''), IntProgress(value=0, max=40)))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"18-1-026011 from 26.03.2018\n",
"18-1-025866 from 26.03.2018\n",
"18-1-025763 from 26.03.2018\n",
"18-1-025640 from 26.03.2018\n",
"18-1-025506 from 26.03.2018\n",
"18-1-025474 from 26.03.2018\n",
"18-1-025472 from 26.03.2018\n",
"18-1-025413 from 26.03.2018\n",
"18-1-025391 from 26.03.2018\n",
"18-1-025382 from 26.03.2018\n",
"18-1-025380 from 26.03.2018\n",
"18-1-025343 from 26.03.2018\n",
"18-1-025245 from 26.03.2018\n",
"18-1-025218 from 26.03.2018\n",
"18-1-025213 from 26.03.2018\n",
"18-1-025189 from 26.03.2018\n",
"18-1-025145 from 26.03.2018\n",
"18-1-025095 from 26.03.2018\n",
"18-1-025084 from 26.03.2018\n",
"18-1-025055 from 26.03.2018\n",
"18-1-025051 from 26.03.2018\n",
"18-1-025021 from 26.03.2018\n",
"18-1-025001 from 26.03.2018\n",
"18-1-024967 from 26.03.2018\n",
"18-1-024957 from 26.03.2018\n",
"18-1-024939 from 26.03.2018\n",
"18-1-024935 from 26.03.2018\n",
"18-1-024913 from 26.03.2018\n",
"18-1-024860 from 26.03.2018\n",
"18-1-024856 from 26.03.2018\n",
"18-1-024833 from 26.03.2018\n",
"18-1-024817 from 26.03.2018\n",
"18-1-024814 from 26.03.2018\n",
"18-1-024806 from 26.03.2018\n",
"18-1-024799 from 26.03.2018\n",
"18-1-024792 from 26.03.2018\n",
"18-1-024763 from 26.03.2018\n",
"18-1-024758 from 26.03.2018\n",
"18-1-024756 from 26.03.2018\n",
"18-1-024742 from 26.03.2018\n"
]
}
],
"source": [
"# Array to hold translated data, same 2D format as the original data.\n",
"translated_rows = []\n",
"\n",
"# Loop through all the rows, translating each cell appropriately\n",
"for data_row in log_progress(data_rows):\n",
"    print(data_row[0])\n",
"    translated_row = (data_row[0],\n",
"                      translate_local(data_row[1]),\n",
"                      translate_google(data_row[2]),\n",
"                      translate_local(data_row[3]),\n",
"                      translate_google(data_row[4]),\n",
"                      clean_number(data_row[5]))\n",
"    translated_rows.append(translated_row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## In case the daily quota runs out, save intermediate results\n",
"Only needed if translation stops prematurely. Otherwise, **ignore the next THREE run-cells**."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Save already-translated items to pickle files.\n",
"import pickle\n",
"\n",
"#print(len(data_rows))\n",
"with open('raw.pickle', 'wb') as handle:\n",
"    pickle.dump(data_rows, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
"\n",
"#print(len(translated_rows))\n",
"with open('translated.pickle', 'wb') as handle:\n",
"    pickle.dump(translated_rows, handle, protocol=pickle.HIGHEST_PROTOCOL)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"40\n",
"40\n"
]
}
],
"source": [
"# To restart/reload from where we left off, load them back in\n",
"with open('raw.pickle', 'rb') as handle:\n",
"    pickeled_data_rows = pickle.load(handle)\n",
"    #print(data_rows == pickeled_data_rows)\n",
"\n",
"with open('translated.pickle', 'rb') as handle:\n",
"    pickeled_translated_rows = pickle.load(handle)\n",
"    #print(translated_rows == pickeled_translated_rows)\n",
"\n",
"print(len(pickeled_translated_rows))\n",
"print(len(pickeled_data_rows))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Translate the rows that had not been translated yet.\n",
"# Stopped at 18-1-011143 from 19.03.2018\n",
"data_rows = pickeled_data_rows\n",
"translated_rows = pickeled_translated_rows\n",
"\n",
"go = False\n",
"count = 0\n",
"# Loop through all the rows; skip until we pass the last translated row\n",
"for data_row in log_progress(data_rows):\n",
"    count = count + 1\n",
"    if go:\n",
"        translated_row = (data_row[0],\n",
"                          translate_local(data_row[1]),\n",
"                          translate_google(data_row[2]),\n",
"                          translate_local(data_row[3]),\n",
"                          translate_google(data_row[4]),\n",
"                          clean_number(data_row[5]))\n",
"        translated_rows.append(translated_row)\n",
"\n",
"    if data_row[0] == '18-1-011143 from 19.03.2018':\n",
"        go = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Print a sample of some translated rows"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"40\n",
" 0 1 \\\n",
"0 18-1-026011 from 26.03.2018 The project did not receive support \n",
"1 18-1-025866 from 26.03.2018 The project did not receive support \n",
"\n",
" 2 \\\n",
"0 REGIONAL PUBLIC ORGANIZATION OF REPUBLIC INGUS... \n",
"1 CHARITABLE FUND FOR ASSISTANCE TO CHILDREN-ORP... \n",
"\n",
" 3 4 \\\n",
"0 social services, social support and citizen pr... STOP-HIV \n",
"1 social services, social support and citizen pr... \"The ball of goodness\" \n",
"\n",
" 5 \n",
"0 2552800.01 \n",
"1 468296.00 \n"
]
}
],
"source": [
"from pandas import DataFrame\n",
"print(len(translated_rows))\n",
"print(DataFrame(translated_rows[0:2]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save translated (English) rows into CSV file as the final result!"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2a9e93fc087f4312bd4e0016eb9b7da8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value=''), IntProgress(value=0, max=40)))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import csv\n",
"\n",
"# newline='' avoids blank lines on Windows; utf-8 keeps any non-ASCII text intact\n",
"with open('english.csv', 'w', newline='', encoding='utf-8') as csv_file:\n",
"    writer = csv.writer(csv_file)\n",
"    for translated_row in log_progress(translated_rows, every=25):\n",
"        writer.writerow(translated_row)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"18-1-026011 from 26.03.2018,The project did not receive support,\"REGIONAL PUBLIC ORGANIZATION OF REPUBLIC INGUSHETIA \"\"ACADEMY OF SOCIAL DEVELOPMENT OF CHILDREN AND YOUTH\"\" Republic of Ingushetia\",\"social services, social support and citizen protection\",STOP-HIV,2552800.01\r",
"\r\n",
"18-1-025866 from 26.03.2018,The project did not receive support,\"CHARITABLE FUND FOR ASSISTANCE TO CHILDREN-ORPHANS \"\"TIME TO HELP\"\" St. Petersburg\",\"social services, social support and citizen protection\",\"\"\"The ball of goodness\"\"\",468296.00\r",
"\r\n"
]
}
],
"source": [
"!head -2 english.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python3.6"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
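For reference, the amount-column conversion performed by the notebook's `clean_number` helper can be exercised on its own. The sketch below is a standalone re-implementation mirroring that helper; it turns a Russian-formatted amount such as `2 117 647,50` (spaces as thousands separators, comma as decimal point) into a string Python can parse:

```python
import re

def clean_number(dirty):
    # Strip outer whitespace, remove internal spaces (thousands
    # separators), and convert the Russian decimal comma to a dot.
    clean = re.sub(' +', '', dirty.strip())
    return clean.replace(',', '.')

print(clean_number('2 117 647,50'))         # '2117647.50'
print(float(clean_number('2 117 647,50')))  # 2117647.5
```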
requests==2.19.1
beautifulsoup4==4.6.3
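The final `english.csv` has no header row; per the column list in the notebook, its fields are proposal number, status, organization, subject area, project name, and requested amount. A minimal sketch of reading it back with the standard `csv` module, using an inline sample (with a hypothetical organization name) rather than the real file:

```python
import csv
import io

# A one-row sample in the same shape as english.csv (no header row).
# 'EXAMPLE ORGANIZATION' is a placeholder, not real data.
sample = io.StringIO(
    '18-1-026011 from 26.03.2018,'
    'The project did not receive support,'
    'EXAMPLE ORGANIZATION,'
    '"social services, social support and citizen protection",'
    'STOP-HIV,2552800.01\n'
)

for row in csv.reader(sample):
    proposal, status, org, area, name, amount = row
    print(proposal, '->', float(amount))
```

When reading the real file, open it with `open('english.csv', newline='', encoding='utf-8')` in place of the `StringIO` sample.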