Last active
May 3, 2021 23:03
-
-
Save allisonmorgan/a04c316ca6bf6e231545e758818f720b to your computer and use it in GitHub Desktop.
Scraping US Grad Hotline
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "following-pillow", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import requests\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "thermal-indicator", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "url_string = \"https://university.graduateshotline.com/ubystate.html\"\n", | |
| "\n", | |
| "r = requests.get(url_string)\n", | |
| "html = r.text\n", | |
| "\n", | |
| "soup = BeautifulSoup(html, 'html.parser')\n", | |
| "links = soup.find_all(\"a\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "ruled-style", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = []\n", | |
| "for link in links:\n", | |
| " data.append([link.get_text(strip=True), link.get('href')])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "manufactured-clothing", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.DataFrame(data)\n", | |
| "df.columns = ['name', 'url']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "hungarian-appeal", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>name</th>\n", | |
| " <th>url</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>graduateshotline</td>\n", | |
| " <td>https://www.graduateshotline.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>GRE</td>\n", | |
| " <td>https://www.graduateshotline.com/gre.html</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>GRE Verbal Practice</td>\n", | |
| " <td>https://gre.graduateshotline.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>GRE Word list</td>\n", | |
| " <td>https://www.graduateshotline.com/gre-word-list...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>Besk Books for GRE</td>\n", | |
| " <td>https://gre.graduateshotline.com/top-4-books-f...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " name url\n", | |
| "0 graduateshotline https://www.graduateshotline.com\n", | |
| "1 GRE https://www.graduateshotline.com/gre.html\n", | |
| "2 GRE Verbal Practice https://gre.graduateshotline.com\n", | |
| "3 GRE Word list https://www.graduateshotline.com/gre-word-list...\n", | |
| "4 Besk Books for GRE https://gre.graduateshotline.com/top-4-books-f..." | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "tutorial-first", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df.to_csv('us_grad_hotline.tsv', sep='\\t', index=False)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment