Skip to content

Instantly share code, notes, and snippets.

@rhizoome
Created December 5, 2012 09:14
Show Gist options
  • Save rhizoome/4214140 to your computer and use it in GitHub Desktop.
Save rhizoome/4214140 to your computer and use it in GitHub Desktop.
DBpedia to sqlite
#/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import sqlite3
import re
import time
from dateutil import parser
c = sqlite3.connect("dbpedia.sqlite")
c.row_factory = sqlite3.Row
c.execute('PRAGMA foreign_keys = OFF')
c.execute('PRAGMA synchronous = OFF')
c.execute('PRAGMA journal_mode = OFF')
c.execute('PRAGMA count_changes= OFF');
c.execute('PRAGMA cache_size = -1048576');
print("Init done")
f = open('instance_types_en.nt', 'r')
resource_match = re.compile("<http://dbpedia.org/resource/(.*)>")
type_match = re.compile(".+/([^/>]+)>")
type_set = set()
with c:
for line in f:
if line[0] == '#':
continue
(resource, rel, type_, lf) = line.split(' ', 3)
match = resource_match.match(resource)
resource_s = match.groups()[0]
match = type_match.match(type_)
type_s = match.groups()[0]
if type_s not in type_set:
try:
c.execute("""
CREATE TABLE
`type_%(type_s)s`
(
`type_%(type_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
`resource` TEXT,
`type` TEXT,
`resource_s` TEXT
)
""" % locals())
except:
pass
type_set.add(type_s)
c.execute("""
INSERT INTO
`type_%(type_s)s`
(
`resource`,
`type`,
`resource_s`
)
VALUES(
:resource,
:type_,
:resource_s
)
""" % locals(), locals())
print("types done")
type_set = None
property_set = set()
f = open('infobox_properties_en.nt', 'r')
resource_match = re.compile("<http://dbpedia.org/resource/(.*)>")
property_match = re.compile("<http://dbpedia.org/property/(.*)>")
value_tuple_match = re.compile('<([^<>]+)>.*\"([^\"]+)\"')
value_type_match = re.compile('\"([^\"]+)\"\^\^<([^<>]+)>')
number_match = re.compile("\D*(\d+)\D*")
dparser = parser.parser()
def try_parse(
type_,
property_,
value_s,
type_hint,
):
lower = value_s.lower()
plower = property_.lower()
factor = 1
if ( "bc" in lower
or "b.c." in lower):
factor = -1
if ( "date" in type_.lower()
or "date" in plower
or "year" in plower):
try:
value_s = dparser.parse(value_s)
value_s = value_s.toordinal()
return ("date", value_s * factor)
except:
match = number_match.match(value_s)
if match:
value_s = match.groups()[0]
try:
value_s = dparser.parse(value_s)
value_s = value_s.toordinal()
return ("date", value_s * factor)
except:
pass
try:
value_s = int(value_s)
return ("int", value_s * factor)
except:
pass
try:
value_s = float(value_s)
return ("float", value_s * factor)
except:
pass
return (type_hint, value_s)
with c:
count = 0
for line in f:
#count += 1
#if count > 1120:
# break
if line[0] == '#':
continue
(resource, property_, value) = line.split(' ', 2)
match = resource_match.match(resource)
resource_s = match.groups()[0]
match = property_match.match(property_)
property_s = match.groups()[0]
value = value.strip()
value = value[:-2]
value = value.strip()
if value.endswith('@en'):
value = value[:-3]
value_s = value
type_hint = "string"
match = value_tuple_match.match(value)
if match:
type_ = match.groups()[0]
value_s = match.groups()[1]
(type_hint, value_s) = try_parse(
type_,
property_,
value_s,
type_hint,
)
else:
match = value_type_match.match(value)
if match:
value_s = match.groups()[0]
type_ = match.groups()[1]
(type_hint, value_s) = try_parse(
type_,
property_,
value_s,
type_hint,
)
else:
match = resource_match.match(value)
if match:
value_s = match.groups()[0]
else:
(type_hint, value_s) = try_parse(
"",
property_,
value,
type_hint,
)
if property_s not in property_set:
try:
c.execute("""
CREATE TABLE
`prop_%(property_s)s`
(
`prop_%(property_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
`resource` TEXT,
`property` TEXT,
`value` TEXT,
`resource_s` TEXT,
`value_s` TEXT,
`type_hint` TEXT
)
""" % locals())
except:
pass
property_set.add(property_s)
c.execute("""
INSERT INTO
`prop_%(property_s)s`
(
`resource`,
`property`,
`value`,
`resource_s`,
`value_s`,
`type_hint`
)
VALUES(
:resource,
:property_,
:value,
:resource_s,
:value_s,
:type_hint
)
""" % locals(), locals())
print("properties done")
property_set = None
c.close()
c = sqlite3.connect("dbpedia.sqlite")
c.execute('PRAGMA foreign_keys = OFF')
c.execute('PRAGMA synchronous = OFF')
c.execute('PRAGMA journal_mode = OFF')
c.execute('PRAGMA count_changes= OFF')
c.execute('PRAGMA cache_size = -1048576');
c.row_factory = sqlite3.Row
cursor = c.execute("""
SELECT
*
FROM
`sqlite_master`
WHERE
`type` = 'table'
AND
`name` like 'type_%'
""")
for table in cursor:
name = table['name']
c.execute("""
CREATE INDEX
`%(name)s_resource_idx`
ON
`%(name)s` (`resource`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_type_idx`
ON
`%(name)s` (`type`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_resource_s_idx`
ON
`%(name)s` (`resource_s`)
""" % locals())
cursor = c.execute("""
SELECT
*
FROM
`sqlite_master`
WHERE
`type` = 'table'
AND
`name` like 'prop_%'
""")
for table in cursor:
name = table['name']
c.execute("""
CREATE INDEX
`%(name)s_resource_idx`
ON
`%(name)s` (`resource`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_property_idx`
ON
`%(name)s` (`property`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_value_idx`
ON
`%(name)s` (`value`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_resource_s_idx`
ON
`%(name)s` (`resource_s`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_value_s_idx`
ON
`%(name)s` (`value_s`)
""" % locals())
c.execute("""
CREATE INDEX
`%(name)s_type_hint_idx`
ON
`%(name)s` (`type_hint`)
""" % locals())
print("index done")
c.execute("ANALYZE")
print("analyze done")
c.execute("VACUUM")
c.close()
print("VACUUM done")
CREATE TABLE
`prop_%(property_s)s`
(
`prop_%(property_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
`resource` TEXT,
`property` TEXT,
`value` TEXT,
`resource_s` TEXT,
`value_s` TEXT,
`type_hint` TEXT
)
CREATE TABLE
`type_%(type_s)s`
(
`type_%(type_s)sID` INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
`resource` TEXT,
`type` TEXT,
`resource_s` TEXT
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment