Skip to content

Instantly share code, notes, and snippets.

@pl77
Created February 23, 2018 05:18
Show Gist options
  • Save pl77/b9240d8f25cca564b180dfaf8777c9e7 to your computer and use it in GitHub Desktop.
Save pl77/b9240d8f25cca564b180dfaf8777c9e7 to your computer and use it in GitHub Desktop.
Internet Archive metadata schema attempt #1
import os
import sys
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, relationship, scoped_session
from sqlalchemy import event
Base = declarative_base()
class Server(Base):
__tablename__ = 'server'
# Here we define columns for the table person
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
name = Column(String, unique=True)
# rootserve = relationship("Rootdata")
def __repr__(self):
return '<Server %r>' % self.name
class Rootdata(Base):
__tablename__ = 'rootdata'
# Here we define columns for the table address.
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
uniq = Column(Integer, unique=True)
created = Column(Integer)
d1_server = Column(Integer, ForeignKey('server.id'))
d2_server = Column(Integer, ForeignKey('server.id'))
dir = Column(String)
files_count = Column(Integer)
is_collection = Column(Integer)
item_size = Column(Integer)
updated = Column(String)
mainserver = Column(Integer, ForeignKey('server.id'))
d1 = relationship("Server", foreign_keys=[d1_server])
d2 = relationship("Server", foreign_keys=[d2_server])
server = relationship("Server", foreign_keys=[mainserver])
def __repr__(self):
return '<Rootdata %r>' % self.uniq
class Taskdata(Base):
__tablename__ = 'taskdata'
# Here we define columns for the table address.
# Notice that each column is also a normal Python instance attribute.
# {'task_id': "", 'identifier': "", 'cmd': "", 'args': "", 'submittime': "", 'submitter': "", 'server': "", 'finished': ""}
task_id = Column(Integer, primary_key=True)
identifier = Column(String)
cmd = Column(String)
args = Column(String)
submittime = Column(String)
submitter = Column(String)
server = Column(String)
finished = Column(String)
def __repr__(self):
return '<Taskdata %r>' % self.identifier
class Format(Base):
__tablename__ = 'format'
# Here we define columns for the table person
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
name = Column(String, unique=True)
def __repr__(self):
return '<Format %r>' % self.name
class Files(Base):
__tablename__ = 'files'
# Here we define columns for the table address.
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
uniq = Column(Integer, ForeignKey('rootdata.uniq'))
bitrate = Column(Integer)
btih = Column(String)
format = Column(Integer, ForeignKey('format.id'))
height = Column(Integer)
width = Column(Integer)
length = Column(Integer)
size = Column(Integer)
mtime = Column(Integer)
dir = Column(String)
name = Column(String)
crc32 = Column(String)
md5 = Column(String)
sha1 = Column(String)
title = Column(String)
__table_args__ = (UniqueConstraint('uniq', 'sha1', name='_uniq_sha1_uc'),)
def __repr__(self):
return '<Files %r>' % self.sha1
class Mediatype(Base):
__tablename__ = 'mediatype'
# Here we define columns for the table person
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
name = Column(String, unique=True)
def __repr__(self):
return '<Mediatype %r>' % self.name
class Collection(Base):
__tablename__ = 'collection'
# Here we define columns for the table person
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
name = Column(String, unique=True)
def __repr__(self):
return '<Collection %r>' % self.name
class User(Base):
__tablename__ = 'user'
# Here we define columns for the table person
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
name = Column(String, unique=True)
def __repr__(self):
return '<User %r>' % self.name
class Language(Base):
__tablename__ = 'language'
# Here we define columns for the table person
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
name = Column(String, unique=True)
def __repr__(self):
return '<Language %r>' % self.name
class Metadata(Base):
__tablename__ = 'metadata'
# Here we define columns for the table address.
# Notice that each column is also a normal Python instance attribute.
id = Column(Integer, primary_key=True)
identifier = Column(String, unique=True)
mediatype = Column(Integer, ForeignKey('mediatype.id'))
collection = Column(Integer, ForeignKey('collection.id'))
creator = Column(Integer, ForeignKey('user.id'))
subject = Column(String)
date = Column(String)
description = Column(String)
language = Column(Integer, ForeignKey('language.id'))
noindex = Column(Integer)
publicdate = Column(Integer)
title = Column(String)
uniq = Column(Integer, ForeignKey('rootdata.uniq'))
uploader = Column(Integer, ForeignKey('user.id'))
def __repr__(self):
return '<Metadata %r>' % self.identifier
def _fk_pragma_on_connect(dbapi_con, con_record):
dbapi_con.execute("PRAGMA synchronous = OFF")
dbapi_con.execute("PRAGMA journal_mode = MEMORY")
# Create an engine that stores data in the local directory's
# sqlalchemy_example.db file.
engine = create_engine('sqlite:///D:\\archive\\internetarchive.db3', connect_args={'timeout': 30})
# Create all tables in the engine. This is equivalent to "Create Table"
# statements in raw SQL.
Base.metadata.create_all(engine)
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
# A DBSession() instance establishes all conversations with the database
# and represents a "staging zone" for all the objects loaded into the
# database session object. Any change made against the objects in the
# session won't be persisted into the database until you call
# session.commit(). If you're not happy about the changes, you can
# revert all of them back to the last commit by calling
# session.rollback()
Session = scoped_session(DBSession)
filesession = Session()
event.listen(engine, 'connect', _fk_pragma_on_connect)
mediatypes = ['account', 'audio', 'collection', 'data', 'etree', 'image', 'movies', 'software', 'texts', 'web']
for mtype in mediatypes:
if filesession.query(Mediatype).filter_by(name=mtype).count() < 1:
mediatype = Mediatype(name=mtype)
filesession.add(mediatype)
filesession.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment