Created
December 5, 2012 09:14
-
-
Save rhizoome/4214140 to your computer and use it in GitHub Desktop.
DBpedia to sqlite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import sqlite3 | |
import re | |
import time | |
from dateutil import parser | |
c = sqlite3.connect("dbpedia.sqlite") | |
c.row_factory = sqlite3.Row | |
c.execute('PRAGMA foreign_keys = OFF') | |
c.execute('PRAGMA synchronous = OFF') | |
c.execute('PRAGMA journal_mode = OFF') | |
c.execute('PRAGMA count_changes= OFF'); | |
c.execute('PRAGMA cache_size = -1048576'); | |
print("Init done") | |
f = open('instance_types_en.nt', 'r') | |
resource_match = re.compile("<http://dbpedia.org/resource/(.*)>") | |
type_match = re.compile(".+/([^/>]+)>") | |
type_set = set() | |
with c: | |
for line in f: | |
if line[0] == '#': | |
continue | |
(resource, rel, type_, lf) = line.split(' ', 3) | |
match = resource_match.match(resource) | |
resource_s = match.groups()[0] | |
match = type_match.match(type_) | |
type_s = match.groups()[0] | |
if type_s not in type_set: | |
try: | |
c.execute(""" | |
CREATE TABLE | |
`type_%(type_s)s` | |
( | |
`type_%(type_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, | |
`resource` TEXT, | |
`type` TEXT, | |
`resource_s` TEXT | |
) | |
""" % locals()) | |
except: | |
pass | |
type_set.add(type_s) | |
c.execute(""" | |
INSERT INTO | |
`type_%(type_s)s` | |
( | |
`resource`, | |
`type`, | |
`resource_s` | |
) | |
VALUES( | |
:resource, | |
:type_, | |
:resource_s | |
) | |
""" % locals(), locals()) | |
print("types done") | |
type_set = None | |
property_set = set() | |
f = open('infobox_properties_en.nt', 'r') | |
resource_match = re.compile("<http://dbpedia.org/resource/(.*)>") | |
property_match = re.compile("<http://dbpedia.org/property/(.*)>") | |
value_tuple_match = re.compile('<([^<>]+)>.*\"([^\"]+)\"') | |
value_type_match = re.compile('\"([^\"]+)\"\^\^<([^<>]+)>') | |
number_match = re.compile("\D*(\d+)\D*") | |
dparser = parser.parser() | |
def try_parse( | |
type_, | |
property_, | |
value_s, | |
type_hint, | |
): | |
lower = value_s.lower() | |
plower = property_.lower() | |
factor = 1 | |
if ( "bc" in lower | |
or "b.c." in lower): | |
factor = -1 | |
if ( "date" in type_.lower() | |
or "date" in plower | |
or "year" in plower): | |
try: | |
value_s = dparser.parse(value_s) | |
value_s = value_s.toordinal() | |
return ("date", value_s * factor) | |
except: | |
match = number_match.match(value_s) | |
if match: | |
value_s = match.groups()[0] | |
try: | |
value_s = dparser.parse(value_s) | |
value_s = value_s.toordinal() | |
return ("date", value_s * factor) | |
except: | |
pass | |
try: | |
value_s = int(value_s) | |
return ("int", value_s * factor) | |
except: | |
pass | |
try: | |
value_s = float(value_s) | |
return ("float", value_s * factor) | |
except: | |
pass | |
return (type_hint, value_s) | |
with c: | |
count = 0 | |
for line in f: | |
#count += 1 | |
#if count > 1120: | |
# break | |
if line[0] == '#': | |
continue | |
(resource, property_, value) = line.split(' ', 2) | |
match = resource_match.match(resource) | |
resource_s = match.groups()[0] | |
match = property_match.match(property_) | |
property_s = match.groups()[0] | |
value = value.strip() | |
value = value[:-2] | |
value = value.strip() | |
if value.endswith('@en'): | |
value = value[:-3] | |
value_s = value | |
type_hint = "string" | |
match = value_tuple_match.match(value) | |
if match: | |
type_ = match.groups()[0] | |
value_s = match.groups()[1] | |
(type_hint, value_s) = try_parse( | |
type_, | |
property_, | |
value_s, | |
type_hint, | |
) | |
else: | |
match = value_type_match.match(value) | |
if match: | |
value_s = match.groups()[0] | |
type_ = match.groups()[1] | |
(type_hint, value_s) = try_parse( | |
type_, | |
property_, | |
value_s, | |
type_hint, | |
) | |
else: | |
match = resource_match.match(value) | |
if match: | |
value_s = match.groups()[0] | |
else: | |
(type_hint, value_s) = try_parse( | |
"", | |
property_, | |
value, | |
type_hint, | |
) | |
if property_s not in property_set: | |
try: | |
c.execute(""" | |
CREATE TABLE | |
`prop_%(property_s)s` | |
( | |
`prop_%(property_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, | |
`resource` TEXT, | |
`property` TEXT, | |
`value` TEXT, | |
`resource_s` TEXT, | |
`value_s` TEXT, | |
`type_hint` TEXT | |
) | |
""" % locals()) | |
except: | |
pass | |
property_set.add(property_s) | |
c.execute(""" | |
INSERT INTO | |
`prop_%(property_s)s` | |
( | |
`resource`, | |
`property`, | |
`value`, | |
`resource_s`, | |
`value_s`, | |
`type_hint` | |
) | |
VALUES( | |
:resource, | |
:property_, | |
:value, | |
:resource_s, | |
:value_s, | |
:type_hint | |
) | |
""" % locals(), locals()) | |
print("properties done") | |
property_set = None | |
c.close() | |
c = sqlite3.connect("dbpedia.sqlite") | |
c.execute('PRAGMA foreign_keys = OFF') | |
c.execute('PRAGMA synchronous = OFF') | |
c.execute('PRAGMA journal_mode = OFF') | |
c.execute('PRAGMA count_changes= OFF') | |
c.execute('PRAGMA cache_size = -1048576'); | |
c.row_factory = sqlite3.Row | |
cursor = c.execute(""" | |
SELECT | |
* | |
FROM | |
`sqlite_master` | |
WHERE | |
`type` = 'table' | |
AND | |
`name` like 'type_%' | |
""") | |
for table in cursor: | |
name = table['name'] | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_resource_idx` | |
ON | |
`%(name)s` (`resource`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_type_idx` | |
ON | |
`%(name)s` (`type`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_resource_s_idx` | |
ON | |
`%(name)s` (`resource_s`) | |
""" % locals()) | |
cursor = c.execute(""" | |
SELECT | |
* | |
FROM | |
`sqlite_master` | |
WHERE | |
`type` = 'table' | |
AND | |
`name` like 'prop_%' | |
""") | |
for table in cursor: | |
name = table['name'] | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_resource_idx` | |
ON | |
`%(name)s` (`resource`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_property_idx` | |
ON | |
`%(name)s` (`property`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_value_idx` | |
ON | |
`%(name)s` (`value`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_resource_s_idx` | |
ON | |
`%(name)s` (`resource_s`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_value_s_idx` | |
ON | |
`%(name)s` (`value_s`) | |
""" % locals()) | |
c.execute(""" | |
CREATE INDEX | |
`%(name)s_type_hint_idx` | |
ON | |
`%(name)s` (`type_hint`) | |
""" % locals()) | |
print("index done") | |
c.execute("ANALYZE") | |
print("analyze done") | |
c.execute("VACUUM") | |
c.close() | |
print("VACUUM done") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE | |
`prop_%(property_s)s` | |
( | |
`prop_%(property_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, | |
`resource` TEXT, | |
`property` TEXT, | |
`value` TEXT, | |
`resource_s` TEXT, | |
`value_s` TEXT, | |
`type_hint` TEXT | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE | |
`type_%(type_s)s` | |
( | |
`type_%(type_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, | |
`resource` TEXT, | |
`type` TEXT, | |
`resource_s` TEXT | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment