Forked from christopherkullenberg/swepubjsonparser.py
Last active
December 30, 2015 08:38
-
-
Save skagedal/5ceddc015bd1156116a4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
from os import listdir | |
def fix_escapes(string):
    """Repair backslash escaping in raw Libris JSON text so that it parses.

    Two passes are applied, in this order:

    1. An over-escaped quote (two backslashes followed by a double quote)
       is collapsed to one backslash followed by the quote.
    2. Every backslash that is neither preceded by another backslash nor
       followed by a double quote, a backslash or a forward slash is
       doubled, turning it into an escaped literal backslash.

    The second pass exists because the feed leaves backslashes unescaped
    in strings, for example in inline LaTeX codes. There is no reliable way
    to tell such a stray backslash from an intended JSON escape, so
    sequences like backslash-n are doubled as well: they come out of the
    JSON parser as a literal backslash plus a letter instead of a control
    character. Some LaTeX codes may be mangled, but parsing succeeds.

    Returns the repaired text; the input string is not modified.
    """
    # Pass 1: undo the over-escaping of quote characters.
    quotes_repaired = string.replace("\\\\\"", "\\\"")

    # Pass 2: double each lone backslash, e.g. "$\geq" becomes "$\\geq".
    lone_backslash = re.compile(r'(?<!\\)\\(?!["\\/])')
    return lone_backslash.sub(r"\\\\", quotes_repaired)
# Directory containing the JSON files to check; the trailing slash is
# required because file paths are built by plain string concatenation.
DATA_DIR = "GU20151228json/"

# Try to parse every file in the directory and print the title of each record.
for filename in listdir(DATA_DIR):  # all files in a directory
    print("opening " + filename)
    # Read raw bytes and decode explicitly as UTF-8 rather than relying on
    # the platform's default text encoding.
    with open(DATA_DIR + filename, "rb") as current_file:
        raw_data = current_file.read()
    # The file is already closed here; repair the escaping before parsing.
    json_text = fix_escapes(raw_data.decode('utf-8'))
    jsondata = json.loads(json_text)
    print("parsed correctly")
    # Records are read from the "list" entry under the top-level "xsearch" key.
    for record in jsondata["xsearch"]["list"]:
        print(record["title"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment