-
-
Save yig/bc29935d22845dc02bf5000bcf18ba25 to your computer and use it in GitHub Desktop.
## Scrape all the ranking categories listed on <https://www.nirfindia.org/2023/Ranking.html>
## into one CSV file per category.
## Requirements: pip install requests beautifulsoup4
## Author: Yotam Gingold <[email protected]>
## License: CC0
## URL: <https://gist.github.com/yig/bc29935d22845dc02bf5000bcf18ba25>
# from pathlib import Path | |
import csv | |
import os | |
import requests | |
from bs4 import BeautifulSoup | |
def main():
    """Scrape each NIRF 2023 ranking category into its own CSV file.

    For every category URL the main ranking table is downloaded, reduced to
    the columns Name, City, State and Rank, and extended with the rows of the
    optional "101-150" and "151-200" rank-band pages. The result is written
    to '<Category>.csv' in the current directory with an added Category
    column. Categories whose CSV file already exists are skipped, so the
    script can be re-run without re-downloading finished categories.
    """
    urls = [
        'https://www.nirfindia.org/2023/OverallRanking.html',
        'https://www.nirfindia.org/2023/UniversityRanking.html',
        'https://www.nirfindia.org/2023/CollegeRanking.html',
        'https://www.nirfindia.org/2023/ResearchRanking.html',
        'https://www.nirfindia.org/2023/EngineeringRanking.html',
        'https://www.nirfindia.org/2023/ManagementRanking.html',
        'https://www.nirfindia.org/2023/PharmacyRanking.html',
        'https://www.nirfindia.org/2023/MedicalRanking.html',
        'https://www.nirfindia.org/2023/DentalRanking.html',
        'https://www.nirfindia.org/2023/LawRanking.html',
        'https://www.nirfindia.org/2023/ArchitectureRanking.html',
        'https://www.nirfindia.org/2023/AgricultureRanking.html',
        'https://www.nirfindia.org/2023/InnovationRanking.html',
    ]

    for url in urls:
        ## The category is the page name without its 'Ranking.html' suffix,
        ## e.g. '.../2023/LawRanking.html' -> 'Law'. Plain string methods are
        ## used instead of pathlib because the input is a URL, not a path.
        category = url.removesuffix('Ranking.html').split('/')[-1]
        outpath = category + '.csv'
        if os.path.exists( outpath ):
            print( "Path exists, skipping:", outpath )
            continue

        table = table_from_URL( url, '#tbl_overall' )
        ## Keep only the columns we want (Name, City, State, Rank)
        if category == 'Innovation':
            ## The Innovation table is indexed differently; its City column
            ## is written as an empty string.
            table = [ [ row[1], '', row[2], row[3] ] for row in table ]
        else:
            table = [ row[1:4] + [row[5]] for row in table ]

        ## Rank-band pages carry no individual rank, so the band itself is
        ## used as the Rank value for each of their rows.
        for rank, suffix in ( ( '101-150', '150.html' ), ( '151-200', '200.html' ) ):
            try:
                table2 = table_from_URL( url.removesuffix('.html') + suffix )
            except KeyError as k:
                ## A rank-band page that cannot be fetched is reported and skipped.
                print( k )
                continue
            ## Add a rank column for this data and combine the tables.
            table.extend( row + [rank] for row in table2 )

        ## newline='' is required by the csv module to avoid blank rows on
        ## Windows; UTF-8 keeps non-ASCII institution names writable
        ## regardless of the platform's default encoding.
        with open( outpath, 'w', newline = '', encoding = 'utf-8' ) as f:
            out = csv.writer( f )
            out.writerow( ['Name', 'City', 'State', 'Rank', 'Category'] )
            out.writerows( row + [category] for row in table )

        print( "Wrote:", outpath )
## Thanks, ChatGPT | |
def table_from_URL( url, prefix = '' ):
    """Download `url` and return the text of its ranking table rows.

    Parameters:
        url: address of the HTML page to fetch.
        prefix: CSS selector text prepended directly (no separator) to
            '.table-condensed > tbody > tr', e.g. '#tbl_overall' to restrict
            the match to the element with that id.

    Returns:
        A list with one entry per matched <tr>. Each entry is a list holding,
        for every direct <td> child, the first whitespace-stripped string
        inside that cell, or '' when the cell contains no text.

    Raises:
        KeyError: if the response status code is not 200.
    """
    # Fetch HTML content from the URL
    print( "Fetching:", url )
    response = requests.get(url)
    if response.status_code != 200:
        raise KeyError( f"URL not found: <{url}>. Status code: {response.status_code}." )

    # Parse HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text contents of the rows
    table_data = []
    for row in soup.select( prefix + '.table-condensed > tbody > tr' ):
        # recursive = False keeps only the row's own cells, not the cells of
        # tables nested inside them. next(..., '') yields '' for an empty
        # cell instead of raising IndexError.
        row_data = [ next( cell.stripped_strings, '' ) for cell in row.find_all('td', recursive = False) ]
        table_data.append( row_data )

    print( f"Fetched {len(table_data)} row{'s' if len(table_data) != 1 else ''}." )
    return table_data
## Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
I recently joined this community and I really like it.
The 2023 data highlights institutions' performance across categories, helping students make informed decisions about their education. One notable trend is the emphasis on research and innovation, which is crucial for the future. Also, in the modern world, AI is developing rapidly and sometimes it is difficult to distinguish it from a person. Such resources as https://edubrain.ai/ perfectly perform homework, which accurately makes its corrections in the educational process. This tool not only offers solutions but also improves understanding, making learning more effective. There are many topics, including programming. Overall, NIRF data is a vital tool for anyone investing in India's education system.
I was looking for this for a long time, thank you!