{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word Frequencies from YouTube Playlists"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Library imports and function to extract videos from playlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from youtube_transcript_api import YouTubeTranscriptApi\n",
    "from nltk.corpus import stopwords \n",
    "from nltk.tokenize import word_tokenize \n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import requests\n",
    "import re\n",
    "\n",
    "video_list = []\n",
    "def playlistUrls(list, url):\n",
    "    sourceCode = requests.get(url).text\n",
    "    soup = BeautifulSoup(sourceCode, 'html.parser')\n",
    "    domain = 'https://www.youtube.com'\n",
    "    for link in soup.find_all(\"a\", {\"dir\": \"ltr\"}):\n",
    "        href = link.get('href')\n",
    "        if href.startswith('/watch?'):\n",
    "            list.append(domain + href)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "User inputs playlist URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "playlist_url = \"https://www.youtube.com/playlist?list=PLbKcy9p3mh_HGUdL2u8mrNi_ZNaHgNoja\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execute function to extract videos from playlist. Iterate over list and download auto-generate subtitles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nOJhzSEMf4I\n",
      "4kp687KxgI4\n",
      "X-npNde0c1I\n",
      "PrMh2hSYxw4\n",
      "vG_llu1Fzeg\n",
      "ZOFAopeC4SM\n",
      "okb-xGMvuCU\n",
      "cGVqD9IUq80\n",
      "2xdhqCs6TjE\n",
      "5EBjscrTU7c\n",
      "O4gnrx2Rh5U\n",
      "ZOefXB0jflc\n",
      "2Yywh-z2tk4\n",
      "RdevkvzqlMM\n",
      "Vw_e63HfsaY\n",
      "_71WrcOFq_A\n",
      "Y3v5gcV5Hu0\n",
      "o2BPboxtJnw\n",
      "yynPyQ_PDoM\n",
      "BAupV8QH-Zg\n",
      "qRY0i_SShBc\n",
      "rK-jJAJfdfg\n",
      "UppcaSJPPMo\n",
      "QwjKWXxNFYE\n",
      "14A4_YQJGG4\n",
      "Uqsf8EQEoGc\n"
     ]
    }
   ],
   "source": [
    "subtitle_list = []\n",
    "\n",
    "playlistUrls(video_list, playlist_url)\n",
    "\n",
    "for x in video_list:\n",
    "    video_id = re.search('watch\\?v=([a-zA-Z0-9\\-\\_]*)', x).group(1)\n",
    "    print(video_id)\n",
    "    get_subs = YouTubeTranscriptApi.get_transcript(video_id)\n",
    "    subtitle_list.append(\" \".join([x[\"text\"] for x in get_subs]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Combine all subtitle text together, remove stop word with NLTK, add NLTK words together and determine frequencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_text = \" \".join([x for x in subtitle_list])\n",
    "stop_words = set(stopwords.words('english')) \n",
    "word_tokens = word_tokenize(all_text) \n",
    "filtered_sentence = [w for w in word_tokens if not w in stop_words] \n",
    "    \n",
    "for w in word_tokens: \n",
    "    if w not in stop_words: \n",
    "        filtered_sentence.append(w) \n",
    "        \n",
    "stop_words_removed = \" \".join([x for x in filtered_sentence])\n",
    "\n",
    "frequency = {}\n",
    "text_string = stop_words_removed.lower()\n",
    "match_pattern = re.findall(r'\\b[a-z]{3,15}\\b', text_string)\n",
    " \n",
    "for word in match_pattern:\n",
    "    count = frequency.get(word,0)\n",
    "    frequency[word] = count + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Word frequency to dataframe and sort descending"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(list(frequency.items()), columns=['Word', 'Frequency'])\n",
    "df.sort_values(\"Frequency\", axis = 0, ascending = False, inplace = True, na_position ='last')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Print dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Word</th>\n",
       "      <th>Frequency</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>like</td>\n",
       "      <td>2200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>people</td>\n",
       "      <td>1504</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>know</td>\n",
       "      <td>1428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>really</td>\n",
       "      <td>1356</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259</th>\n",
       "      <td>get</td>\n",
       "      <td>1182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>right</td>\n",
       "      <td>1144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>one</td>\n",
       "      <td>1052</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>843</th>\n",
       "      <td>want</td>\n",
       "      <td>1016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>gon</td>\n",
       "      <td>918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>things</td>\n",
       "      <td>804</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>google</td>\n",
       "      <td>776</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>content</td>\n",
       "      <td>730</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>think</td>\n",
       "      <td>716</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>actually</td>\n",
       "      <td>704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>see</td>\n",
       "      <td>698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>349</th>\n",
       "      <td>going</td>\n",
       "      <td>598</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>165</th>\n",
       "      <td>time</td>\n",
       "      <td>592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>lot</td>\n",
       "      <td>588</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>369</th>\n",
       "      <td>something</td>\n",
       "      <td>570</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>well</td>\n",
       "      <td>566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>611</th>\n",
       "      <td>make</td>\n",
       "      <td>532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>646</th>\n",
       "      <td>use</td>\n",
       "      <td>528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>281</th>\n",
       "      <td>kind</td>\n",
       "      <td>506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>158</th>\n",
       "      <td>site</td>\n",
       "      <td>500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>388</th>\n",
       "      <td>say</td>\n",
       "      <td>498</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77</th>\n",
       "      <td>look</td>\n",
       "      <td>496</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>397</th>\n",
       "      <td>way</td>\n",
       "      <td>486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>534</th>\n",
       "      <td>need</td>\n",
       "      <td>480</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>got</td>\n",
       "      <td>472</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>231</th>\n",
       "      <td>good</td>\n",
       "      <td>452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2331</th>\n",
       "      <td>colored</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5039</th>\n",
       "      <td>wrapped</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2328</th>\n",
       "      <td>captured</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5041</th>\n",
       "      <td>duplication</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5042</th>\n",
       "      <td>exceptions</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5019</th>\n",
       "      <td>detract</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5018</th>\n",
       "      <td>amole</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5017</th>\n",
       "      <td>archived</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4996</th>\n",
       "      <td>alternatively</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2404</th>\n",
       "      <td>contextualize</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2403</th>\n",
       "      <td>lifecycle</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4986</th>\n",
       "      <td>amongst</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2396</th>\n",
       "      <td>demos</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4989</th>\n",
       "      <td>surefire</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4991</th>\n",
       "      <td>stein</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2394</th>\n",
       "      <td>entertain</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2391</th>\n",
       "      <td>reconfirms</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2390</th>\n",
       "      <td>shiny</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4997</th>\n",
       "      <td>outweigh</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2351</th>\n",
       "      <td>giller</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4999</th>\n",
       "      <td>variant</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2380</th>\n",
       "      <td>enlarge</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2370</th>\n",
       "      <td>invent</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5008</th>\n",
       "      <td>scrutiny</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5009</th>\n",
       "      <td>pvc</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2357</th>\n",
       "      <td>intermix</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5012</th>\n",
       "      <td>tabbed</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2355</th>\n",
       "      <td>mail</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2352</th>\n",
       "      <td>quacks</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7221</th>\n",
       "      <td>triskin</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7222 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               Word  Frequency\n",
       "11             like       2200\n",
       "55           people       1504\n",
       "49             know       1428\n",
       "172          really       1356\n",
       "259             get       1182\n",
       "112           right       1144\n",
       "28              one       1052\n",
       "843            want       1016\n",
       "22              gon        918\n",
       "43           things        804\n",
       "86           google        776\n",
       "168         content        730\n",
       "32            think        716\n",
       "33         actually        704\n",
       "59              see        698\n",
       "349           going        598\n",
       "165            time        592\n",
       "38              lot        588\n",
       "369       something        570\n",
       "145            well        566\n",
       "611            make        532\n",
       "646             use        528\n",
       "281            kind        506\n",
       "158            site        500\n",
       "388             say        498\n",
       "77             look        496\n",
       "397             way        486\n",
       "534            need        480\n",
       "7               got        472\n",
       "231            good        452\n",
       "...             ...        ...\n",
       "2331        colored          2\n",
       "5039        wrapped          2\n",
       "2328       captured          2\n",
       "5041    duplication          2\n",
       "5042     exceptions          2\n",
       "5019        detract          2\n",
       "5018          amole          2\n",
       "5017       archived          2\n",
       "4996  alternatively          2\n",
       "2404  contextualize          2\n",
       "2403      lifecycle          2\n",
       "4986        amongst          2\n",
       "2396          demos          2\n",
       "4989       surefire          2\n",
       "4991          stein          2\n",
       "2394      entertain          2\n",
       "2391     reconfirms          2\n",
       "2390          shiny          2\n",
       "4997       outweigh          2\n",
       "2351         giller          2\n",
       "4999        variant          2\n",
       "2380        enlarge          2\n",
       "2370         invent          2\n",
       "5008       scrutiny          2\n",
       "5009            pvc          2\n",
       "2357       intermix          2\n",
       "5012         tabbed          2\n",
       "2355           mail          2\n",
       "2352         quacks          2\n",
       "7221        triskin          2\n",
       "\n",
       "[7222 rows x 2 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}