Created
January 6, 2015 17:27
-
-
Save BtbN/ed9dc9ab254d8f64fe79 to your computer and use it in GitHub Desktop.
combine.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3

# Number of input files that make up one output part.
# 5 minute parts:
# files_per_part = 410
# 6 hour parts:
files_per_part = 5400

# Directory whose sub-directories hold the input files.
input_dir = "data/union"
# Directory the generated concat lists are written to.
output_dir = "output"
| import os | |
| import sys | |
| from sqlalchemy.ext.declarative import declarative_base | |
| from sqlalchemy import Column, Integer, String, create_engine | |
| from sqlalchemy.orm import sessionmaker | |
| from sqlalchemy.sql import exists | |
# Run from the script's own directory so the relative paths used by this
# script (the SQLite database file, the input and the output directory)
# resolve the same way regardless of where it is launched from.
# abspath() is required because __file__ may be a bare relative name such as
# "combine.py"; dirname() would then return '' and os.chdir('') would raise.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# SQLite database recording which input files have already been used.
engine = create_engine('sqlite:///used_files.sqlite')
# Declarative base class shared by the ORM models below.
Base = declarative_base()
class PartFile(Base):
    """Row of the ``partfiles`` table, identified solely by its name."""

    __tablename__ = 'partfiles'

    # The name string doubles as the primary key.
    name = Column(String, primary_key=True)

    def __init__(self, fname):
        """Create a row whose ``name`` is set to *fname*."""
        self.name = fname
class ConfVal(Base):
    """Row of the ``confvals`` table: a named value with a string and an integer slot."""

    __tablename__ = 'confvals'

    # The name string is the primary key.
    name = Column(String, primary_key=True)
    # Value slots; the constructor leaves both unset.
    strval = Column(String)
    intval = Column(Integer)

    def __init__(self, name):
        """Create a row called *name*; ``strval`` and ``intval`` are not assigned here."""
        self.name = name
# Create the tables if they are missing, then open the single session used
# by the rest of the script.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

# Sub-directories of input_dir, visited in sorted order.
dirs = sorted(
    os.path.join(input_dir, d)
    for d in os.listdir(input_dir)
    if os.path.isdir(os.path.join(input_dir, d))
)

# Absolute paths of the input files selected for this part.
files = []
for subdir in dirs:
    # Once the part is full there is nothing left to collect, so stop here
    # instead of listing and stat'ing every remaining sub-directory.
    if len(files) >= files_per_part:
        break
    subfiles = sorted(
        d for d in os.listdir(subdir)
        if os.path.isfile(os.path.join(subdir, d))
    )
    for filen in subfiles:
        if len(files) >= files_per_part:
            break
        # Database key: "<sub-directory name>/<file name after its first '-'>".
        # When the name contains no '-', find() returns -1 and the whole
        # file name is used.
        fileo = '%s/%s' % (os.path.basename(subdir), filen[filen.find('-') + 1:])
        # Skip files already recorded as used.
        if session.query(exists().where(PartFile.name == fileo)).scalar():
            continue
        # Mark the file as used; this only becomes permanent on commit.
        session.add(PartFile(fileo))
        files.append(os.path.abspath(os.path.join(subdir, filen)))

# Not enough unused files for a full part: report progress, discard the
# pending PartFile rows and exit with a failure status.
if len(files) < files_per_part:
    print('At %s/%s input parts' % (len(files), files_per_part))
    session.rollback()
    sys.exit(-1)
# Load the persistent part counter, creating it at 0 on first use, and
# advance it to obtain this part's number.
partcnt = session.query(ConfVal).filter_by(name='partcount').first()
if partcnt is None:
    partcnt = ConfVal('partcount')
    partcnt.intval = 0
    session.add(partcnt)
partcnt.intval += 1

oname = 'part%05d.concat.txt' % partcnt.intval

# Make sure the output directory exists so that a fresh checkout does not
# fail on the first run.
os.makedirs(output_dir, exist_ok=True)

# Write one "file '<absolute path>'" line per selected input file. The
# context manager closes the handle even if a write fails.
# NOTE(review): paths containing a single quote are written unescaped;
# confirm whether the consumer of this list requires escaping.
with open(os.path.join(output_dir, oname), 'w') as listing:
    listing.writelines("file '%s'\n" % path for path in files)

print('Wrote concat list %s' % oname)
# Persist the used-file rows and the new counter only after the list has
# been written successfully.
session.commit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment