Last active
December 21, 2023 07:32
-
-
Save Ebrahim-Ramadan/82b4fc3365a4c89cafea83aa36dfaad9 to your computer and use it in GitHub Desktop.
paper-citation-project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import time
from collections import defaultdict
# Read node data from nodes.csv
def read_nodes(file):
    """Load a mapping from the first column to the second column of a CSV file.

    The first row is treated as a header and skipped. For every remaining
    row, column 0 becomes the key and column 1 the value; any further
    columns are ignored. Surrounding whitespace is stripped from both.

    Parsing goes through the ``csv`` module, so a quoted field that contains
    a comma (e.g. ``1,"Graphs, Trees and Paths"``) is kept whole instead of
    being cut at the first comma. Rows with fewer than two columns -- most
    commonly blank lines -- are skipped instead of raising ``IndexError``.

    Args:
        file: Path of the CSV file to read (opened as UTF-8 text).

    Returns:
        dict: Maps column 0 to column 1. If a key appears more than once,
        the last row wins.
    """
    nodes = {}
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded inside quoted fields.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; tolerate a completely empty file.
        for row in reader:
            if len(row) < 2:
                # Blank or incomplete line: nothing usable to store.
                continue
            nodes[row[0].strip()] = row[1].strip()
    return nodes
# read edges data from edges.csv | |
def read_edges(file):
    """Load (column 0, column 1) pairs from a CSV file.

    The first row is treated as a header and skipped. Every remaining row
    contributes one tuple made of its first two columns, with surrounding
    whitespace stripped; any further columns are ignored. Row order is
    preserved.

    Parsing goes through the ``csv`` module. Rows with fewer than two
    columns -- most commonly blank lines -- are skipped instead of raising
    ``IndexError``.

    Args:
        file: Path of the CSV file to read (opened as UTF-8 text).

    Returns:
        list[tuple[str, str]]: One ``(column 0, column 1)`` tuple per
        usable row, in file order.
    """
    edges = []
    # newline='' lets the csv module handle line endings itself.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; tolerate a completely empty file.
        for row in reader:
            if len(row) < 2:
                # Blank or incomplete line: cannot form an edge.
                continue
            edges.append((row[0].strip(), row[1].strip()))
    return edges
# retrieving highest cited papers | |
def highest_cited_papers(nodes, edges):
    """Rank the known papers by how many edges point at them.

    Args:
        nodes: Mapping whose keys are paper ids; the value stored under an
            id is returned alongside it.
        edges: Iterable of indexable edges; element ``[1]`` of each edge is
            the id that receives one count.

    Returns:
        list[tuple]: ``(paper_id, nodes[paper_id])`` pairs ordered from the
        highest count to the lowest. Ids that are counted but are not keys
        of ``nodes`` are left out. Ids with equal counts keep the order in
        which they were first seen in ``edges``.
    """
    tally = defaultdict(int)
    for link in edges:
        tally[link[1]] += 1

    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    ranked_ids = sorted(tally, key=tally.get, reverse=True)

    return [(pid, nodes[pid]) for pid in ranked_ids if pid in nodes]
# Find the closest group of papers
def closest_group_of_papers(edges, threshold):
    """Collect the connected groups of papers that reach a minimum size.

    Each edge is treated as an undirected link between its two ids. The
    function walks the resulting graph depth-first and gathers every set of
    ids that are reachable from one another.

    Args:
        edges: Iterable of two-element edges ``(paper_id, citation_id)``.
        threshold: Minimum number of ids a group must contain to be
            returned.

    Returns:
        list[list]: One list of ids per qualifying group. Groups with a
        single member are never returned, whatever ``threshold`` is. The
        order of ids inside a group comes from set iteration and is not
        defined.
    """
    adjacency = defaultdict(set)
    for source, target in edges:
        adjacency[source].add(target)
        adjacency[target].add(source)

    visited = set()
    components = []
    for root in adjacency:
        if root in visited:
            continue

        # Iterative depth-first walk over everything reachable from root.
        members = set()
        pending = [root]
        while pending:
            current = pending.pop()
            if current in visited:
                continue
            visited.add(current)
            members.add(current)
            pending.extend(adjacency[current] - visited)

        if len(members) > 1:
            components.append(list(members))

    return [component for component in components
            if len(component) >= threshold]
# Script section: load both CSV files, print the ten most-cited papers and
# the connected groups of papers, then report how long the run took.
nodes_file = 'nodes.csv'
edges_file = 'edges.csv'

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# clock is adjusted while the script runs.
start_time = time.perf_counter()

nodes_data = read_nodes(nodes_file)
edges_data = read_edges(edges_file)

highest_cited = highest_cited_papers(nodes_data, edges_data)
print("Highest cited papers:")
for paper_id, paper_title in highest_cited[:10]:
    print(f"Paper ID: {paper_id}, Title: {paper_title}")

threshold_value = 5  # You may adjust the threshold value
closest_group = closest_group_of_papers(edges_data, threshold_value)
# NOTE(review): closest_group_of_papers() applies the threshold to the number
# of papers in a group, not to a count of common citations, so the wording of
# the message below does not match what is filtered. Text left unchanged.
print(
    f"\nclosest group of papers with at least {threshold_value} common citations:")
print(closest_group)

end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"\nExecution time: {execution_time} s")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment