Created
July 3, 2020 19:18
-
-
Save sai-teja-ponugoti/386620e1f8a6866f7c8811996c4a22bc to your computer and use it in GitHub Desktop.
Beautiful_Soup_example.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Beautiful_Soup_example.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyPjXSlVEE81ZJmbmOC7vmP1", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/sai-teja-ponugoti/386620e1f8a6866f7c8811996c4a22bc/beautiful_soup_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jL7sVv-nqVhQ", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### **Extracting a question and its best-answer pair from a Stack Overflow web page**\n", | |
"\n", | |
"**An example of how to clean HTML pages using BS4 to extract the required content for an NLP dataset pre-processing step.**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gzP2QKeRpwp8", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from bs4 import BeautifulSoup\n", | |
"from urllib.request import urlopen" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ngq4Vklzpxhi", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# enter required url\n", | |
"myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\"\n", | |
"html = urlopen(myurl).read()\n", | |
"soupified = BeautifulSoup(html, \"html.parser\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "g4ClOhC6qnuL", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 51 | |
}, | |
"outputId": "a311b925-93bb-4ae2-e4dd-be2645414207" | |
}, | |
"source": [ | |
"# extract required tags from the html\n", | |
"# this requires knowledge of the HTML structure of the page\n", | |
"# extracting the question part\n", | |
"question = soupified.find(\"div\", {\"class\": \"question\"})\n", | |
"questiontext = question.find(\"div\", {\"class\": \"post-text\"})\n", | |
"print(\"Question: \\n\", questiontext.get_text().strip())" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Question: \n", | |
" What is the module/method used to get the current time?\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DG_H7mTcqzPp", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 374 | |
}, | |
"outputId": "a8362d49-ba13-4f9c-9429-bb1fed8b8f37" | |
}, | |
"source": [ | |
"# extracting the answer part\n", | |
"answer = soupified.find(\"div\", {\"class\": \"answer\"})\n", | |
"answertext = answer.find(\"div\", {\"class\": \"post-text\"})\n", | |
"print(\"Best answer: \\n\", answertext.get_text().strip())" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Best answer: \n", | |
" Use:\n", | |
">>> import datetime\n", | |
">>> datetime.datetime.now()\n", | |
"datetime.datetime(2009, 1, 6, 15, 8, 24, 78915)\n", | |
"\n", | |
">>> print(datetime.datetime.now())\n", | |
"2009-01-06 15:08:24.789150\n", | |
"\n", | |
"And just the time:\n", | |
">>> datetime.datetime.now().time()\n", | |
"datetime.time(15, 8, 24, 78915)\n", | |
"\n", | |
">>> print(datetime.datetime.now().time())\n", | |
"15:08:24.789150\n", | |
"\n", | |
"See the documentation for more information.\n", | |
"To save typing, you can import the datetime object from the datetime module:\n", | |
">>> from datetime import datetime\n", | |
"\n", | |
"Then remove the leading datetime. from all of the above.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment