-
-
Save admariner/87ece2fe8cb45b42f7e8fb4517c84e86 to your computer and use it in GitHub Desktop.
Scripts for Python Programs to Start Digging Into JSTOR’s Early Journals
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PROGRAM NAME: journal_list.py
# Displays the title of every journal in the JSTOR Early Journal Content Data Bundle.
# Highlights those with "Psych" or "Philos" in the title (+ two general science journals).
# Displays the filename of the first article for each journal.
# Written by Christopher D Green
# January 2014
# Modified by (if you modify this program, put your name here.)
# Date Modified (if you modify this program, put the date of the modification here.)
#START OF PROGRAM
import os #You need to import this in order to read the filenames from the disk below. | |
#Below is the directory path to where I am keeping the files on my computer.
#You will need to change it to suit your own system.
dirname='/Users/chriso/Desktop/JSTOR-journals/bundle/'

#Number of leading characters dropped from each filename when it is displayed.
#(Presumably a fixed-length prefix shared by every file in the bundle -- confirm
#against your own copy of the data.)
FILENAME_PREFIX_LENGTH=8

#How many of the most recently displayed titles are remembered, so that a run of
#articles from the same journal is only reported once.
RECENT_TITLES_KEPT=3


def _journal_title(contents):
    """Return the text between <journaltitle> and </journaltitle>.

    Returns None when either tag is missing, i.e. when the file does not
    look like a journal article.
    """
    open_tag='<journaltitle>'
    start=contents.find(open_tag)
    if start==-1:
        return None
    start+=len(open_tag)
    end=contents.find('</journaltitle>', start)
    if end==-1:
        return None
    return contents[start:end]


def _is_highlighted(title):
    """Return True for the journals that get the '***' treatment on screen."""
    return ('Psych' in title or 'Philos' in title or 'Monist' in title
            or title=='Science' or 'Scientific Monthly' in title)


def list_journals(directory=None):
    """Print each journal title once, with the first file it was seen in.

    directory -- folder holding the bundle's article files; defaults to the
                 module-level ``dirname``.

    Titles of particular interest are set off by blank lines and '***'.
    A title is not printed again while it is still one of the last
    RECENT_TITLES_KEPT titles printed.
    """
    if directory is None:
        directory=dirname
    #Start with empty placeholders so that an empty title is never displayed.
    recent=['']*RECENT_TITLES_KEPT
    #os.listdir() returns names in arbitrary order; sorting keeps files from the
    #same journal together (assuming their names sort together) and makes the
    #output repeatable.
    for name in sorted(os.listdir(directory)):
        path=os.path.join(directory, name)
        #.DS_Store is not text (reading it crashes); sub-folders cannot be opened.
        if name=='.DS_Store' or not os.path.isfile(path):
            continue
        #"with" closes the file even if reading it fails.
        with open(path, 'r', encoding='utf-8') as article:
            contents=article.read()
        title=_journal_title(contents)
        if title is None or title in recent:
            continue
        shortname=name[FILENAME_PREFIX_LENGTH:]
        if _is_highlighted(title):
            print()
            print('***',title, shortname)
            print()
        else:
            print(title, shortname)
        #Newest title goes to the front; the oldest drops off the end.
        recent=[title]+recent[:RECENT_TITLES_KEPT-1]


if __name__=='__main__':
    list_journals()
#END OF PROGRAM |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#PROGRAM NAME: move_psychphil.py (ver 1.0)
# Extracts article titles, author names, volume numbers, and page numbers
# for Psych, Philos, and some science journals in the JSTOR EARLY JOURNAL
# CONTENT DATA BUNDLE. Displays these on screen and creates file names to
# copy them to their own folder (but does not actually do the copying).
# Also creates .csv list of articles from The Monist only.
# Written by Christopher D Green
# January 2014
# Modified April 2014 to not move files, but just list their original file names in the Excel file.
# START PROGRAM
import os #needed to navigate directory tree of resident harddisk | |
#set from and to directories
fromdirname='/Users/chriso/Desktop/JSTOR-journals/bundle/'
todirname='/Users/chriso/Desktop/JSTOR-journals/temp/'

#list of desired journals and the abbreviations to be used in filenames
journals=[['The Journal of Philosophy, Psychology and Scientific Methods',
           'The Philosophical Review', 'The Monist'],
          ['JPPSM','PhilRev','Monist']]
#other interesting journals: The American Journal of Psychology, Science,
# The Scientific Monthly, The Journal of Speculative Philosophy

#column headings for the .csv lists; one heading per value written in each row
CSV_HEADER='filename,year,vol,page,author,title,outfilename\n'


def _tag_text(contents, tag, default=''):
    """Return the text between the first <tag> and the </tag> that follows it.

    Returns ``default`` when either tag is missing.
    """
    open_tag='<'+tag+'>'
    start=contents.find(open_tag)
    if start==-1:
        return default
    start+=len(open_tag)
    end=contents.find('</'+tag+'>', start)
    if end==-1:
        return default
    return contents[start:end]


def _is_full_article(contents):
    """Return True when the text right after the first <type> tag is 'fla'."""
    typeloc=contents.find('<type>')
    return typeloc!=-1 and contents.startswith('fla', typeloc+len('<type>'))


def _first_author(contents):
    """Return the first <list-item> inside <authors>, or '' if there is none.

    The name is located by its tags rather than by a fixed character offset,
    so it does not depend on how the file happens to be indented.
    Only the first author is returned; co-authors are ignored.
    """
    block_start=contents.find('<authors>')
    if block_start==-1:
        return ''
    block_end=contents.find('</authors>', block_start)
    if block_end==-1:
        block_end=len(contents)
    item_tag='<list-item>'
    item_start=contents.find(item_tag, block_start, block_end)
    if item_start==-1:
        return ''
    name_start=item_start+len(item_tag)
    name_end=contents.find('<', name_start)  #the < of the closing </list-item>
    if name_end==-1:
        name_end=len(contents)
    return contents[name_start:name_end].strip()


def _reverse_author(author):
    """Return the author as LastnameFirstname with spaces and dots removed.

    Names shorter than three characters become 'NoAuthor'.
    This process still fails on names with Jr. at the end.
    These must be corrected by hand at present.
    """
    if len(author)<3:
        return 'NoAuthor'
    if author=='P. C.':
        author='Paul Carus'
    #everything after the last space is taken to be the last name
    firstname, space, lastname=author.rpartition(' ')
    parts=[lastname, firstname+space]
    #Names printed all in CAPS are changed to Initial Capitals. title() (rather
    #than capitalize()) leaves initials such as "J. M." untouched.
    parts=[part.title() if part.isupper() else part for part in parts]
    return ''.join(parts).replace(' ', '').replace('.', '')


def export_articles(fromdir=None, todir=None):
    """Copy the full articles of the wanted journals and list them in .csv files.

    fromdir -- folder holding the bundle's article files (default: fromdirname)
    todir   -- existing folder that receives the copies and the .csv lists
               (default: todirname)

    Each full article ('fla') from a journal named in ``journals`` is written
    to todir as <abbreviation>.<volume>.<first page>.txt, and one row
    describing it is added to <abbreviation>.csv. The journal title, article
    title and author are also displayed on screen.
    """
    if fromdir is None:
        fromdir=fromdirname
    if todir is None:
        todir=todirname
    abbreviations=dict(zip(journals[0], journals[1]))
    #Start every list afresh, with the same headings, so that running the
    #program twice does not add each article to the lists twice.
    for abbrev in abbreviations.values():
        with open(os.path.join(todir, abbrev+'.csv'), 'w', encoding='utf-8') as listfile:
            listfile.write(CSV_HEADER)
    #os.listdir() returns names in arbitrary order; sorting makes the order of
    #the rows in the .csv lists repeatable.
    for name in sorted(os.listdir(fromdir)):
        path=os.path.join(fromdir, name)
        #.DS_Store is not text (reading it crashes); sub-folders cannot be opened.
        if name=='.DS_Store' or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as article:
            contents=article.read()
        jtitle=_tag_text(contents, 'journaltitle', None)
        if jtitle not in abbreviations or not _is_full_article(contents):
            continue
        print(jtitle)
        volnum=_tag_text(contents, 'volume', '0')
        pnum=_tag_text(contents, 'fpage', '0')    #initial page number
        year=_tag_text(contents, 'year')[:4]
        #commas would split the title across .csv columns, so replace them with `
        atitle=_tag_text(contents, 'title').replace(',', '`')
        print(atitle)
        author=_first_author(contents)
        revauthor=_reverse_author(author)
        print(author)
        #assemble name of file to which article will be saved
        outfilename=abbreviations[jtitle]+'.'+volnum+'.'+pnum+'.txt'
        print()
        with open(os.path.join(todir, outfilename), 'w', encoding='utf-8') as copy:
            copy.write(contents)
        #add the article to its journal's .csv list
        row=[name, year, volnum, pnum, revauthor, atitle, outfilename]
        with open(os.path.join(todir, abbreviations[jtitle]+'.csv'), 'a', encoding='utf-8') as listfile:
            listfile.write(','.join(row)+'\n')


if __name__=='__main__':
    export_articles()
# END PROGRAM |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment