Skip to content

Instantly share code, notes, and snippets.

@neubig
Created October 18, 2025 21:12
Show Gist options
  • Select an option

  • Save neubig/f287ad6f73dbe6721acfbe51948e34be to your computer and use it in GitHub Desktop.

Select an option

Save neubig/f287ad6f73dbe6721acfbe51948e34be to your computer and use it in GitHub Desktop.
Plot affiliations of people publishing at ICLR/ICML/NeurIPS
# Get the data from here: https://github.com/martenlienen/icml-neurips-iclr-dataset
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
# Lookup tables for classify_affiliation. They live at module level so they
# are built once at import time instead of on every call.

# Substrings suggesting that one side of an "A & B" string names an institution.
_INSTITUTION_HINTS = (
    'university', 'institute', 'college', 'school',
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple',
    'cmu', 'mit', 'stanford', 'berkeley', 'oxford', 'cambridge',
    'fair', 'deepmind', 'openai', 'vector', 'mila',
    'lab', 'laboratory', 'research', 'center', 'centre',
    'mbzuai', 'neuralmagic', 'criteo', 'strategic machine',
    'abridge', 'inverted.ai', 'macro-eyes',
)

# Whole-string matches (after strip + lowercase) classified as 'university'.
# Misspellings are deliberate: they mirror strings that occur in the data.
_UNIVERSITY_EXACT_MATCHES = frozenset({
    'mit', 'cmu', 'nyu', 'ucla', 'ucsd', 'ucsb', 'uiuc', 'epfl',
    'eth', 'caltech', 'gatech', 'georgia tech', 'ut austin',
    'uw-madison', 'umass amherst', 'unc chapel hill', 'usc',
    'cornell', 'stanford', 'mila', 'technion', 'kaust',
    'hkust', 'kaist', 'postech', 'snu', 'nus',
    'mit csail', 'tti-chicago', 'toyota technological institute at chicago',
    'weizmann institute of science', 'ist austria', 'ista',
    'indian institute of science', 'skolkovo institute of science and technology',
    'cispa helmholtz center for information security',
    'korea advanced institute of science and technology',
    'korea advanced institute of science & technology',
    'korea advanced institute of science and technology (kaist)',
    'moscow institute of physics and technology',
    'ecole polytechnique', 'école polytechnique',
    'ku leuven', 'tu berlin', 'tu darmstadt', 'tu munich', 'lmu munich',
    'virginia tech', 'texas a&m', 'politecnico di milano',
    'oxford', 'duke', 'cuhk', 'skoltech', 'mbzuai',
    'telecom paristech', 'télécom paristech', 'télécom paris', 'ubc', 'ut-austin',
    'ustc', 'kth', 'casia', 'ttic', 'academia sinica',
    'tu wien', 'eurecom', 'cornell tech', 'u. montreal',
    'mila - quebec ai institute', 'mila / u. montreal',
    'vector institute', 'idsia', 'istituto italiano di tecnologia',
    'uw madison', 'fair', 'ut dallas', 'u oxford', 'ens',
    'unist', 'tu delft', 'uw', 'u washington', 'upenn', 'columbia',
    'tu dortmund', 'tu dresden', 'tu kaiserslautern',
    'école polytechnique fédérale de lausanne',
    'ecole normale superieure', 'hasso plattner institute',
    'politecnico di torino', 'karlsruher institut für technologie',
    'national institute of informatics',
    'institute of science and technology austria',
    'ulsan national institute of science and technology',
    'mpi-sws', 'alan turing institute', 'the alan turing institute',
    'tti chicago', 'cea', 'rutgers', 'ntu',
    'hec montreal', 'polytechnique montreal', 'jhu', 'suny buffalo',
    'ethz', 'nudt', 'ens lyon', 'weizmann institute', 'uestc',
    'ets montreal', 'anu', 'tum', 'uic',
    'mila, u. montreal', 'ecole normale supérieure de paris',
    'ecole normale supérieure', 'école normale supérieure',
    'idiap research institute', 'max-planck institute',
    'mpi tübingen', 'hec montreal & mila',
    'institute for advanced study', 'cold spring harbor laboratory',
    'rpi', 'brown', 'penn state', 'cispa',
    'helmholtz munich', 'purdue', 'centralesupelec',
    'zuse institute berlin', 'ucsc', 'uts',
    'gatsby computational neuroscience unit',
    'computer vision center barcelona',
    'saarland informatics campus',
    'johns hopkins', 'uci', 'dtu', 'tu eindhoven',
    'courant institute', 'gwangju institute',
    'unc-chapel hill', 'lip6', 'wisconsin',
    'institute of statistical mathematics',
    'mpi informatics', 'fu berlin', 'mcgill', 'mipt',
    'irisa', 'gran sasso science institute', 'cwi',
    'sjtu', 'tamu', 'centrum wiskunde',
    'shanghai new york univeristy',
    'u. michigan', 'univ. grenoble alpes', 'standord',
    'sutd', 'ens paris saclay', 'gist',
    'ecole des ponts paristech', 'rwth aachen', 'cambridge',
    'iit', 'hku', 'puc-rio', 'dartmouth', 'hkbu',
    'okinawa institute', 'oist', 'ucsf', 'zju',
    'northwestern u', 'cornell univeristy', 'lmu',
    'usi', 'iupui', 'stony brook', 'njust',
    'uoft', 'umd', 'northwestern', 'u chicago',
    'jku linz', 'unsw', 'cityu', 'ucas',
    'lse', 'njit', 'umass', 'unc', 'hust',
    'ensta paris', 'tata institute', 'ecole polytechnique de montreal',
    'korea institute for advanced study',
    'national ict australia', 'atr',
    'institut de physique théorique',
    'international institute of information technology hyderabad',
    'fondazione bruno kessler', 'instituto superior técnico',
    'mines paristech', 'gatsby unit', 'scut',
    'ensae paris', 'vu amsterdam', 'nyu shanghai',
    'australian institute for machine learning',
    'ecole centrale paris', 'samsi', 'enpc',
    'tu graz', 'cu boulder', 'dgist', 'uchicago',
    'iiis', 'us naval academy', 'umass, amherst',
    'suny albany', 'cuny graduate center',
    'nankai univeristy', 'nara institute',
    'ntu, singapore', 'chung-ang univ',
    'area science park', 'indian statistical institute',
    'cern', 'telecom sudparis', 'instituto tecnico',
    'donders institute', 'barcelona supercomputing center',
    'champalimaud centre', 'mpi informatik',
    'nankai', 'montreal institute for learning algorithms',
    'international school for advanced studies',
    'instituto de telecomunicações', 'qatar computing research institute',
    'ecole normale supérieure de cachan',
    'mila/mcgill', 'mcgill - mila', 'mcgill/mila', 'mcgill, mila',
    'polytechnique montréal', 'institute of neuroinformatics',
    'indraprastha institute', 'institute of science and technology austria (ista)',
    'georgia institution of technology', 'georgia institute of techology',
    'ut southwestern', 'institute of software, cas',
    'hkust(gz)', 'computer vision center',
    'mila / école polytechnique de montréal',
    'shanghai jiao tong unviersity', 'mila, udem',
    'u michigan', 'école des ponts paristech',
    'helmholtz ai', 'unsw, sydney',
    'lmu münchen', 'lmu munich, mcml',
    'national univ. of singapore',
    'korea institute of science and technology',
    'the technion', 'ecole polytechnique federale de lausanne',
    'ens paris-saclay', 'mila / u.montreal', 'mila/udem',
    'zib', 'champalimaud centre for the unknown',
    'toyota technnological institute', 'tianjin unibersity',
    'the skolkovo institute of science and technology',
    'helmholtz zentrum münchen', 'shanghai qizhi institute',
    'korea advanced institute of science & technology (kaist)',
    'istituto italiano di tecnologia, genova',
    'ecole des ponts - paristech', 'statslab cambridge',
    'centralsupélec', 'weierstrass institute, berlin',
    'state univerisity of new york at buffalo',
    'irt saint exupéry', 'beijing jiaotong univercity',
    'assistant professor', 'deutsches krebsforschungszentrum',
    'international higher school for advanced studies trieste',
    'hebrew univeristy of jerusalem', 'mpi tuebingen',
    'umass lowell', 'institut curie',
    'max-planck institute for informatics',
    'netherlands institute for neuroscience',
    'william and mary', 'huji', 'santa fe institute',
    'univeristy at buffalo', 'u of toronto', 'u. rochester',
    'ucb', 'polytechnique', 'hec montréal',
    'basque center for applied mathematics',
    'ensae paristech', 'bar-ilan',
    'kth stockholm', 'tata institute of fundamental research',
    'german aerospace center', 'fundacao champalimaud',
    'electronics and telecommunications research institute',
    'weierstrass institute', 'ecole polytechnique, ipparis',
    'national institutes of health',
    'télécom paris, institut polytechnique de paris',
    'singapore institute of manufacturing technology',
    'laboratory for physical sciences',
    'ecole polytechnique fédérale de lausanne',
    'national institute of advanced industrial science and technology',
    'isir, umr 7222',
    'ecole nationale de la statistique et de l\'administration economique',
    'national research council canada',
    'max-planck-institut für informatik',
    'nara institute of science and technology',
    'crest, ensae', 'laboratoire de physique de l\'ecole normale supérieure paris',
    'usc/isi', 'zib/tub',
    'central institute for mental health mannheim',
    'eastern institute for advanced study',
    'max delbrück center for molecular medicine',
    'ecole nationale des ponts et chausees',
    'ecole centrale de paris', 'mpi for biological cybernetics',
    'centrum voor wiskunde en informatica',
    'columbia business school', 'crest-ensae',
    'ensae, institut polytechnique de paris',
    'friedrich miescher institute', 'univ. of washington',
    'dtu and ku', 'national observatory of athens',
    'eth-z, mpi-is', 'alfréd rényi institute of mathematics',
    'heidelberg institute for theoretical studies',
    'nyu langone', 'institute for basic science',
    'simula research laboratory', 'mpi-is',
    'state key laboratory of cad&cg',
    'mrc laboratory of molecular biology',
    'unsw sydney', 'ltci, télécom paris, institut polytechnique de paris',
    'cispa helmholtz center',
    'german aerospace center, institute of data science',
    'fgv emap', 'helmholtz center munich',
    'dkfz', 'umich',
    'technion, technion', 'kaist ai',
    'usc information sciences institute',
    'unversity of texas at austin',
    'indian institute of science, bangalore',
    'courant institute of mathematical sciences',
    'gwangju institute of science and technology',
    'suny at buffalo', 'irit',
    'institute of science and technology',
    'indraprastha institute of information technology, delhi',
    'institute of information engineering, cas',
    'international school for advanced studies (sissa)',
    'ut southwestern medical center',
    'okinawa institute of science and technology (oist)',
    'fundacao champalimaud pt507131827',
    'irt saint exupery',
    'ict', 'tianjin unibersity, china',
    'iiai', 'cas',
    'institute of software, chinese academy',
    'international institute of information technology, hyderabad',
    'cripac, casia', 'bcm/rice',
    'institute for information transmission problems',
    'institute of information engineering, cas, china',
    'chinese academic of sciences',
    'weizmann inst.',
    'iie,cas',
    'electrical engineering & computer science department',
    'memorial sloan kettering cancer center',
    'centrum wiskunde & informatica',
    'centrum wiskunde & informatica, amsterdam',
    'centrum wiskunde informatica',
    'faculty',
    'telecom paris',
    'the institute of statistical mathematics',
    'courant institute of mathematical sciences, nyu',
    'electrical engineering & computer science department, university of california, berkeley',
    'electrical engineering & computer science department, university of california berkeley',
    'okinawa institute of science and technology',
    'montreal institute of learning algorithms',
    'chung-ang univ., korea',
    'skolkovo institute of science and technology, moscow institute of physics and technology',
    'assistant professor - unc chapel hill',
    'mpi for informatics',
    'ens paris',
    'biocomplexity institute & initiative, university of virginia',
    'ecole des ponts',
    'oregon health & science university',
    'hong kong ust',
    'iiia-csic',
    'kit',
    'carnegie mellon',
    'toyota tech institute chicago',
    'frankfurt institute for advanced studies',
    'cispa helmholtz center i.g.',
    'gnt, ecole normale superieure',
    'suny at albany',
    'lcsl iit/mit',
    'ens cachan - cmla',
    'nuist',
    'utdallas',
    'weizmann',
    'lbl/nersc',
    'georgia institute of technolog',
    'saarland informatics campus, max-planck institute',
    'cambridge, alan turing institute',
    'ellis institute tübingen & university of freiburg',
    'ellis unit / university linz',
    'kaist, deepauto',
    'tum & helmholtz ai',
    'cispa, helmholtz center, saarland informatics campus',
    'mpi & ellis institute tübingen',
    'ellis institute tübingen, mpi-is',
    'crest',
    'mila / udem',
    'mila/u. montreal',
    'computer science',
    'hkust-gz',
    'univ of mass',
    'georgia tech.',
    'tau',
    'univ. of southern california',
    'univ iowa',
    'centrale-supelec',
    'polytechnique montréal & mila',
    'upc barcelona',
    'instituto de telecomunicacoes',
    'heidelberg collaboratory for image processing',
    'idiap',
    'vub',
    'centre de visió per computador (cvc)',
    'cnam',
    'yonsei univ.',
    'imar',
    'max plank institute for software systems',
    'hec paris / hi!paris',
    'igsnrr, chinese academy of sciences, beijing, china',
    'ias; purdue university',
    'the univ. of tokyo / riken',
    'ellis unit linz, lit ai lab, institute for machine learning, johannes kepler university, institute for advanced research in artificial intelligence (iarai)',
    'lif, iuf, aix-marseille university, cnrs',
    'dtu compute',
    'sfu',
    'nara institute of science and technology, japan',
    'swiss data science center',
    'shanghaitech',
    'insa rouen',
    'mila, montreal',
    'sklois, institute of information engineering, chinese academy of sciences; scs, university of chinese academy of sciences',
    'univ. notre dame',
    'institut de mathématiques de toulouse',
    'unc - chapel hill',
    'xjtu',
    'ncsu',
    'univ michigan, ann arbor',
    'u. colorado, boulder',
    'cnam (conservatoire national des arts et métiers)',
    'asu',
    'isae-supaero / university of toulouse',
    'msu',
    'usc institute for creative technology',
    'karolinska institute',
    'umd-cp & unc-ch',
    'sustech',
    'departments of applied mathematics and electrical & computer engineering, university of washington seattle',
    'crest, ensae, institut polytechnique de paris',
    'basque center for applied mathematics (bcam)',
    'mpi-mis',
    'computer vision center, uab',
    'weill cornell medicine',
    'national key lab for novel software technology',
    'brigham and women\'s hospital',
    'mpi-inf',
    'puc-chile',
    'ensae - iit',
    'institut für informatik',
    'scuola normale superiore',
    'essec business school',
    'tu braunschweig',
    'nyu courant',
    'uc, santa barbara',
    'bits pilani',
    'max-planck-institute for informatics',
    'fudan univerisity',
    'crest/ensae, ip paris',
    'ucr',
    'chalmers',
    'amss',
    'ellis institute & mpi intelligent systems, tübingen ai center',
    'tti at chicago',
    'msc lab',
    'univ. modena reg.',
    'pcg',
    'mics centralesupelec',
    'academy of mathematics and systems science, cas',
    'sissa, trieste, italy',
    'ruhr-universtät bochum',
    'vidyasirimedhi institute of science and technology',
    'imt atlantique',
    'ecole polytechnique, france',
    'cuhk-shenzhen',
    'hkust (gz)',
    'centai institute',
    'u. amsterdam',
    'champalimaud foundation',
    'complexity science hub vienna',
})

# Whole-string matches classified as 'government'. Checked before the
# university table, so entries present in both resolve to 'government'.
_GOVERNMENT_EXACT_MATCHES = frozenset({
    'u.s. army research laboratory',
    'german aerospace center (dlr)',
    'german aerospace center',
    'german aerospace center, institute of data science',
    'national institutes of health',
    'national institute of mental health',
    'national research council canada',
    'institute for infocomm research',
    'institute for infocomm research, singapore',
    'singapore institute of manufacturing technology',
    'electronics and telecommunications research institute',
    'laboratory for physical sciences',
    'inrae',
    'aist',
    'national institute of advanced industrial science and technology',
    'agency for science, technology and research',
})

# Substring indicators of a university (first keyword pass).
_UNIVERSITY_KEYWORDS = (
    'university', 'universi', 'college', 'institute of technology',
    'school of', 'polytechnic', 'université', 'universidad', 'universit',
    'eth zurich', 'eth zürich', 'caltech', 'epfl',
    'stanford', 'berkeley', 'harvard', 'princeton', 'yale',
    'ucl', 'uc ', 'u.c.', 'imperial college', 'king\'s college',
    'akademi', 'hochschule', 'faculty of',
    'dept of', 'department of', 'graduate school', 'research school',
    'iit ', 'iiit', 'tsinghua', 'peking university',
    'chinese academy of sciences', 'academy of sciences',
    'institute of computing technology', 'institute of automation',
)

# Substring indicators of industry (companies and corporate research labs).
_INDUSTRY_KEYWORDS = (
    'google', 'deepmind', 'microsoft', 'amazon', 'meta', 'facebook',
    'apple', 'nvidia', 'intel', 'ibm', 'oracle', 'adobe',
    'baidu', 'alibaba', 'tencent', 'huawei', 'samsung',
    'openai', 'anthropic', 'cohere',
    'bloomberg', 'sony', 'siemens',
    'bosch', 'qualcomm', 'uber', 'twitter', 'netflix',
    'linkedin', 'salesforce', 'spotify', 'snap inc',
    'bytedance', 'waymo', 'cruise', 'zoox',
    'renaissance technologies', 'two sigma', 'citadel',
    'jane street', 'jump trading',
    'ant group', 'yandex', 'ntt', 'sea ai lab', 'shanghai ai lab',
    'flatiron institute', 'international business machines',
    'lg ai research', 'instadeep', 'aws', 'borealis ai',
    'sensetime', 'naver', 'genentech', 'nec labs', 'horizon robotics',
    'kuaishou', 'element ai', 'hugging face', 'prowler',
    'vinai', 'meituan', 'jd explore', 'csiro',
    'layer 6 ai', 'international digital economy academy',
    'toyota research institute', 'jd ai research', 'mediatek',
    'peng cheng laboratory', 'graphcore', 'ant financial',
    'yahoo research', 'kakao brain', 'booz allen',
    'morgan stanley', 'zhejiang lab', 'shanghai qi zhi',
    'shenzhen research institute of big data', 'disney research',
    'eleutherai', 'aitrics', 'criteo', 'shannon.ai',
    'servicenow', 'deeplearning.ai', 'nicta', 'aig',
    'walmart labs', 'merl', 'yahoo! research',
    'nec laboratories',
    'tiktok', 'scale ai', 'stability ai', 'yahoo!',
    'preferred networks', 'jd.com', 'ant research',
    'lg electronics', 'accenture', 'etri', 'a*star',
    'bigai', 'msr', 'msra', 'layer6 ai',
    'j.p. morgan', 'jpmorgan', 'tinkoff', 'together ai',
    'cerebras', 'nokia bell labs', 'autodesk',
    'megvii', 'midea group', 'visa', 'amd',
    'data61', 'face++', 'vivo', 'wechat',
    'mobileye', 'mitsubishi electric', 'nlpr',
    'airi', 'center for ai safety',
    'curious ai', 'vicarious', 'ai2',
    'astrazeneca', 'oppo research', 'krafton',
    'insitro', 'benevolent', 'vmware',
    'hikvision', 'valeo', 'simons foundation',
    'technology innovation institute',
    'recital', 'abacus.ai', 'inspir.ai', 'owkin',
    'antgroup', 'nnaisense', 'snap research',
    'zalando', 'wayve', 'volkswagen', 'redwood research',
    'aurora innovation', 'kakao enterprise', 'pengcheng laboratory',
    'dyson', 'dp technology', 'cisco', 'xai',
    'valence labs', 'yahoo labs', 'sk t-brain',
    '4paradigm', 'sap', 'jp morgan', 'neural magic',
    'pingan', 'capital one', 'cyberagent', 'lenovo',
    'the voleon group', 'babylon health',
    'tal education', 'mosaicml', 'zhipu ai', 'mistral ai',
    'yahoo', 'tusimple', 'speechmatics',
    'bayer', 'cainiao', 'inverted ai',
    'hewlett packard', 'panasonic', 'aleph alpha',
    'squirrel ai', 'honda research', 'airbnb',
    'habana labs', 'pinterest', 'asapp',
    'layer6', 'recursion', 'ericsson', 'secondmind',
    'inflection ai', 'goodai', 'helixon',
    'finvolution', 'didi', 'merantix', 'apollo research',
    'originai', 'stability.ai', 'prescient design',
    'lg display', 'epoch ai', 'duolingo',
    'philips research', 'imagen technologies',
    'northrop grumman', 'gracenote', 'fred hutchinson',
    'edf lab', 'mayo clinic', 'didi chuxing',
    'plumerai', 'recurrent ai', 'lighton',
    'bitdefender', 'octoml', 'wormpex',
    'unity technologies', 'american family insurance',
    'optum', 'cleanlab', 'terminus group',
    'waabi', 'pasqal', 'samaya', 'runway',
    'coupang', 'si analytics', 'medarc',
    'general motors', 'damo academy',
    'du xiaoman', 'man truck', 'tomtom',
    'telefonica', 'airbus',
    'deep mind', 'far ai', 'sri international',
    'ideas ncbr', 'microsft research', 'basf se',
    'sgit ai', 'cellarity', 'dfki',
    'advanced micro devices', 'banner alzheimer',
    'basf', 'nec', 'cea saclay',
    'ambiata', 'bell labs', 'tno',
    'noah\'s ark lab', 'ml collective',
    'aniti', 'valence discovery',
    'mlcommons', 'machine discovery', 'iqvia',
    'angelalign', 'verses', 'service now',
    'black sesame', 'dotphoton',
    'disneyresearch|studios',
    'baai', 'fivecent', 'dreamfold ai', 'iscas',
    'hko', 'layer 6', 'd. e. shaw', 'xtx markets',
    'arthur ai', 'algoritmica', 'orcam', 'realai',
    'orange labs', 'd-wave', 'hrl laboratories',
    'denso', 'whizbang labs', 'janelia research campus',
    'webank', 'nokia labs', 'deepauto',
    'neuralmagic', 'inverted.ai', 'vector institute',
    'macro-eyes', 'abridge', 'openstax',
    'draper', 'draper laboratory',
    'covariant.ai', 'open ai',
    'hudson river trading', 'argo ai',
    'prophesee', 'radiance technologies',
    'royal caliber', 'vintech technology development joint stock company',
    'omron sinic x', 'bcai',
    'descript', 'charm therapeutics',
    'idemia', 'arm research',
    'feedzai', 'pathai',
    'a9.com', 'sait',
    'sharechat', 'phigent robotics',
    'isomorphic labs',
    'sinopac holdings', 'qiyuan lab',
    'augment computing', 'biomap',
    'rain', 'iflytek research',
    'huggingface', 'jpmc',
    'zhengzhou tobacco research institute of cntc',
    'xiaohongshu', 'mercedes-benz tech innovation',
    'yazhouwan lab', 'the fin ai',
    'qihoo 360', 'pengcheng lab',
)

# Substring indicators of government labs and agencies.
_GOVERNMENT_KEYWORDS = (
    'national laboratory', 'national lab', 'brookhaven',
    'sandia', 'argonne', 'los alamos', 'lawrence berkeley',
    'lawrence livermore', 'oak ridge', 'pacific northwest',
    'army research', 'air force research', 'naval research',
    'nist', 'nih', 'national institute',
    'etri', 'a*star', 'dsta',
    'national research council', 'csiro',
    'peng cheng lab', 'peng cheng laboratory',
)

# Substring indicators of research institutes (reported as 'university'
# unless one of _COMPANY_LAB_MARKERS is also present).
_RESEARCH_KEYWORDS = (
    'max planck', 'fraunhofer', 'cnrs', 'inria', 'riken',
    'allen institute', 'broad institute',
    'salk institute', 'howard hughes', 'hhmi',
    'research center', 'research centre', 'research foundation',
    'ai lab', 'ai laboratory', 'artificial intelligence laboratory',
)
_COMPANY_LAB_MARKERS = (
    'shanghai', 'alibaba', 'tencent', 'baidu',
    'microsoft', 'google', 'facebook', 'meta',
)

# Corporate legal-form markers (plain substring match, like the other lists).
_INDUSTRY_SUFFIXES = (
    'inc', 'inc.', 'corp', 'corp.', 'ltd', 'ltd.',
    'llc', 'gmbh', 'co.', 'co,', 'limited',
)

# Last-resort patterns, tried only after every list above has failed.
_FALLBACK_UNIVERSITY_PATTERNS = (
    'university', 'universit', 'univeristy', 'universtät', 'universität',
    'college', 'école', 'ecole',
    'institute of technology', 'institut de', 'instituto de',
    'department of', 'dept of', 'dept.', 'faculty of',
    'school of', 'graduate school',
    'polytechnic', 'polytech',
)
_FALLBACK_INDUSTRY_PATTERNS = (
    '.ai', '.com', 'labs', ' lab ', 'laboratory',
    'technologies', 'technology inc', 'systems inc',
    'therapeutics', 'robotics',
)


def _contains_any(text, needles):
    """Return True if any string in *needles* occurs as a substring of *text*."""
    return any(needle in text for needle in needles)


def classify_affiliation(affiliation):
    """Classify an affiliation string by institution type.

    Args:
        affiliation: Raw affiliation value. Anything that is not a
            non-blank string (None, NaN, numbers, '', the literal 'None')
            is classified as 'other'.

    Returns:
        One of 'university', 'industry', 'government' or 'other'.

    Matching is case-insensitive. The curated exact-match tables are
    consulted first; strings that look like several affiliations joined by
    a separator are then classified by their first affiliation; finally a
    cascade of substring keyword lists is tried in a fixed priority order.
    """
    # Non-strings (None, NaN, numbers) cannot be matched against anything.
    if not isinstance(affiliation, str):
        return 'other'
    if affiliation == 'None' or affiliation.strip() == '':
        return 'other'

    # Undo HTML escaping of ampersands, with or without the trailing ';'.
    affiliation = affiliation.replace('&amp;', '&').replace('&amp', '&')
    affiliation_lower = affiliation.lower()

    # Exact matches are checked BEFORE any separator splitting: many table
    # entries themselves contain ' / ', '; ', ' & ' or several commas and
    # would otherwise be cut apart before they could ever match.
    affiliation_stripped = affiliation.strip().lower()
    if affiliation_stripped in _GOVERNMENT_EXACT_MATCHES:
        return 'government'
    if affiliation_stripped in _UNIVERSITY_EXACT_MATCHES:
        return 'university'

    # Three or more commas suggests a list of affiliations: use the first.
    if affiliation_lower.count(',') >= 3:
        return classify_affiliation(affiliation_lower.split(',')[0].strip())

    # Slash- or pipe-separated dual affiliations: use the first.
    for separator in (' / ', ' | '):
        if separator in affiliation_lower:
            head = affiliation_lower.split(separator)[0]
            return classify_affiliation(head.strip())

    # Semicolon-separated dual affiliations: use the first.
    if '; ' in affiliation_lower and affiliation_lower != '; ':
        return classify_affiliation(affiliation_lower.split('; ')[0].strip())

    # ' & ' may join two affiliations or be part of a single name. Split
    # only when the left side looks like an institution on its own. If only
    # the right side does (left is e.g. a department), or neither does, the
    # full string is classified below.
    if ' & ' in affiliation_lower:
        parts = affiliation_lower.split(' & ')
        if len(parts) == 2:
            first = parts[0].strip()
            if _contains_any(first, _INSTITUTION_HINTS):
                return classify_affiliation(first)

    # Keyword cascade. Order matters: earlier lists take priority.
    if _contains_any(affiliation_lower, _UNIVERSITY_KEYWORDS):
        return 'university'
    if _contains_any(affiliation_lower, _INDUSTRY_KEYWORDS):
        return 'industry'
    if _contains_any(affiliation_lower, _GOVERNMENT_KEYWORDS):
        return 'government'
    if _contains_any(affiliation_lower, _RESEARCH_KEYWORDS):
        # A research/AI lab that names a company is a corporate lab.
        if _contains_any(affiliation_lower, _COMPANY_LAB_MARKERS):
            return 'industry'
        return 'university'  # Research institutes count as university
    if _contains_any(affiliation_lower, _INDUSTRY_SUFFIXES):
        return 'industry'

    # Fallbacks for anything that slipped through the lists above.
    if _contains_any(affiliation_lower, _FALLBACK_UNIVERSITY_PATTERNS):
        return 'university'
    if (_contains_any(affiliation_lower, _FALLBACK_INDUSTRY_PATTERNS)
            and 'national lab' not in affiliation_lower):
        # 'national lab' also covers 'national laboratory' (government).
        return 'industry'
    return 'other'
def load_and_classify_data(csv_path):
    """Read the papers CSV and label every row with an affiliation type.

    Args:
        csv_path: Path of the CSV file, passed to ``pd.read_csv``. The file
            must provide an 'Affiliation' column.

    Returns:
        The loaded DataFrame with an added 'affiliation_type' column holding
        the result of ``classify_affiliation`` for each row.
    """
    print("Loading dataset...")
    papers = pd.read_csv(csv_path)
    row_count = len(papers)
    print(f"Loaded {row_count} rows")

    print("\nClassifying affiliations...")
    labels = papers['Affiliation'].apply(classify_affiliation)
    papers['affiliation_type'] = labels
    return papers
def show_top_affiliations_by_category(df, n=50):
    """Print the most frequent affiliations per category and the category split.

    Args:
        df: DataFrame with 'Affiliation' and 'affiliation_type' columns.
        n: Maximum number of affiliations listed per category.

    Returns:
        None. All output goes to stdout.
    """
    categories = ['university', 'industry', 'government', 'other']
    rule = '=' * 80

    for category in categories:
        print(f"\n{rule}")
        print(f"TOP {n} AFFILIATIONS - {category.upper()}")
        print(rule)
        in_category = df.loc[df['affiliation_type'] == category, 'Affiliation']
        affiliation_counts = Counter(in_category.dropna())
        for rank, (affiliation, count) in enumerate(affiliation_counts.most_common(n), 1):
            print(f"{rank:3d}. [{count:5d}] {affiliation}")

    # Show category distribution
    print(f"\n{rule}")
    print("CATEGORY DISTRIBUTION")
    print(rule)
    category_counts = df['affiliation_type'].value_counts()
    total = len(df)
    for category in categories:
        count = category_counts.get(category, 0)
        # An empty frame has total == 0; report 0% instead of dividing by zero.
        percentage = (count / total) * 100 if total else 0.0
        print(f"{category:12s}: {count:7d} ({percentage:5.2f}%)")
    print(f"{'Total':12s}: {total:7d}")
def analyze_by_author_position(df):
    """Collect the first and last author's affiliation type for every paper.

    Rows are grouped by ('Conference', 'Year', 'Title'); within each group
    the first row is taken as the first author and the last row as the last
    author, so the result depends on the row order of ``df``.

    Args:
        df: DataFrame with 'Conference', 'Year', 'Title' and
            'affiliation_type' columns.

    Returns:
        A ``(first_df, last_df)`` tuple of DataFrames, each with the columns
        'Year' and 'affiliation_type' and one row per paper. Both columns
        are present even when ``df`` contains no papers.
    """
    print("\nAnalyzing author positions...")
    columns = ['Year', 'affiliation_type']
    first_rows = []
    last_rows = []
    # groupby never yields an empty group, so iloc[0] / iloc[-1] are safe.
    for (_conf, year, _title), group in df.groupby(['Conference', 'Year', 'Title']):
        types = group['affiliation_type']
        first_rows.append({'Year': year, 'affiliation_type': types.iloc[0]})
        last_rows.append({'Year': year, 'affiliation_type': types.iloc[-1]})

    # Passing the columns explicitly keeps the schema stable for empty input,
    # which a later groupby on these columns relies on.
    first_df = pd.DataFrame(first_rows, columns=columns)
    last_df = pd.DataFrame(last_rows, columns=columns)
    return first_df, last_df
def _plot_trend_pair(first_data, last_data, ylabel, title_suffix, ylim=None):
    """Draw first/last author line charts side by side on a new figure."""
    _fig, axes = plt.subplots(1, 2, figsize=(18, 6))
    for ax, data, who in zip(axes, (first_data, last_data), ('First', 'Last')):
        data.plot(kind='line', ax=ax, marker='o', linewidth=2)
        ax.set_xlabel('Year', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(
            f'Papers by {who} Author Affiliation Type Over Time ({title_suffix})',
            fontsize=14, fontweight='bold',
        )
        ax.legend(title='Affiliation Type', fontsize=10)
        ax.grid(True, alpha=0.3)
        if ylim is not None:
            ax.set_ylim(*ylim)
    plt.tight_layout()


def _plot_ratio_pair(first_counts, last_counts):
    """Draw first/last author university-to-industry ratio charts on a new figure."""
    _fig, axes = plt.subplots(1, 2, figsize=(18, 6))
    for ax, counts, who in zip(axes, (first_counts, last_counts), ('First', 'Last')):
        # The ratio is only defined when both categories occur in the data.
        if 'university' in counts.columns and 'industry' in counts.columns:
            industry = counts['industry']
            # Mask years without industry papers so the line shows a gap
            # there rather than an infinite ratio.
            ratio = counts['university'] / industry.where(industry != 0)
            ratio.plot(kind='line', ax=ax, marker='o', linewidth=2, color='purple')
            ax.set_xlabel('Year', fontsize=12)
            ax.set_ylabel('Academia/Industry Ratio', fontsize=12)
            ax.set_title(f'{who} Author: Academia/Industry Ratio Over Time',
                         fontsize=14, fontweight='bold')
            ax.grid(True, alpha=0.3)
            ax.axhline(y=1, color='red', linestyle='--', linewidth=1, label='Equal (1:1)')
            ax.legend(fontsize=10)
    plt.tight_layout()


def _save_current_figure(path, description):
    """Save the current matplotlib figure to *path*, report it, and close it."""
    plt.savefig(path, dpi=300, bbox_inches='tight')
    print(f"{description} saved to: {path}")
    plt.close()


def plot_author_affiliation_trends(first_df, last_df, output_dir='/workspace/project'):
    """Plot per-year affiliation-type trends for first and last authors.

    Three PNG files are written to ``output_dir``:
    ``affiliation_trends_absolute.png`` (paper counts),
    ``affiliation_trends_percentage.png`` (share of each year's papers) and
    ``affiliation_trends_ratio.png`` (university count / industry count).

    Args:
        first_df: DataFrame with 'Year' and 'affiliation_type' columns, one
            row per paper, describing first authors.
        last_df: Same layout, describing last authors.
        output_dir: Directory receiving the PNG files; created if missing.
            Defaults to the location that was previously hard-coded.

    Returns:
        None.
    """
    import os  # local import: the file's import block does not provide os

    print("\nGenerating plots...")
    if first_df.empty or last_df.empty:
        # pandas cannot plot an empty frame; say so instead of failing there.
        print("No data to plot.")
        return
    os.makedirs(output_dir, exist_ok=True)

    # Papers per (year, affiliation type); combinations that do not occur are 0.
    first_counts = first_df.groupby(['Year', 'affiliation_type']).size().unstack(fill_value=0)
    last_counts = last_df.groupby(['Year', 'affiliation_type']).size().unstack(fill_value=0)

    # Plot 1: Absolute counts
    _plot_trend_pair(first_counts, last_counts, 'Number of Papers', 'Absolute')
    _save_current_figure(
        os.path.join(output_dir, 'affiliation_trends_absolute.png'),
        "Absolute count plots",
    )

    # Plot 2: Percentages (each year's row is normalised to sum to 100)
    first_pct = first_counts.div(first_counts.sum(axis=1), axis=0) * 100
    last_pct = last_counts.div(last_counts.sum(axis=1), axis=0) * 100
    _plot_trend_pair(first_pct, last_pct, 'Percentage of Papers (%)', 'Percentage',
                     ylim=(0, 100))
    _save_current_figure(
        os.path.join(output_dir, 'affiliation_trends_percentage.png'),
        "Percentage plots",
    )

    # Plot 3: Academia/Industry Ratio
    _plot_ratio_pair(first_counts, last_counts)
    _save_current_figure(
        os.path.join(output_dir, 'affiliation_trends_ratio.png'),
        "Academia/Industry ratio plots",
    )
def main(csv_path='/workspace/project/icml-neurips-iclr-dataset/papers.csv'):
    """Run the full pipeline: load, classify, summarise and plot.

    Args:
        csv_path: Location of the papers CSV. Defaults to the path that was
            previously hard-coded, so calling ``main()`` behaves as before.
    """
    # Load and classify data
    df = load_and_classify_data(csv_path)

    # Show top affiliations by category
    show_top_affiliations_by_category(df, n=50)

    # Analyze by author position
    first_df, last_df = analyze_by_author_position(df)

    # Create plots
    plot_author_affiliation_trends(first_df, last_df)

    banner = "=" * 80
    print("\n" + banner)
    print("Analysis complete!")
    print(banner)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment