"""Rebuild dated "presubmission" folders from archived PAINT GAF files.

Each archived GAF file carries a ``!Created on`` date and a
``!PANTHER version`` in its header.  For every distinct (version, date)
pair found in the archive a folder ``<version>/<date>/presubmission`` is
created and filled with one copy of every known PAINT GAF file: the file
created on that date if there is one, otherwise the closest earlier file
of the same PANTHER version.
"""
import datetime
import gzip
import os
import shutil

ARCHIVE_DIR = "archive"
NEW_DIR_ROOT = ""

# PAINT sources that have a ``gene_association.paint_<source>.gaf.gz`` file.
PAINT_SOURCES = (
    "cgd",
    "chicken",
    "dictyBase",
    "ecocyc",
    "fb",
    "human",
    "mgi",
    "other",
    "pombase",
    "rgd",
    "sgd",
    "tair",
    "wb",
    "zfin",
)
# PANTHER versions that always get an (initially empty) entry per file.
PANTHER_VERSIONS = ("13.1", "14.1")

CREATED_ON_PREFIX = "!Created on "
PANTHER_VERSION_PREFIX = "!PANTHER version: v."
# Header dates look like "Wed Jun 6 13:59:48 2018".  This is what '%c'
# expands to in the C locale, spelled out so parsing does not depend on
# the locale of the process.
CREATED_ON_FORMAT = "%a %b %d %H:%M:%S %Y"
ISO_DATE_FORMAT = "%Y-%m-%d"


def _header_value(line, prefix):
    """Return the text after *prefix* without the trailing '.' and newline."""
    return line[len(prefix):].strip().rstrip(".")


def _read_gaf_header(gaf_file):
    """Return ``(creation datetime, PANTHER version)`` read from *gaf_file*.

    Either element is ``None`` when the corresponding header line is absent.
    Every line is scanned, so when a header line occurs more than once the
    last occurrence wins.
    """
    created_on = None
    pthr_version = None
    with gzip.open(gaf_file, mode="rt") as af:
        # Iterate lazily: the decompressed file can be large and only the
        # two header lines are of interest.
        for line in af:
            if line.startswith(CREATED_ON_PREFIX):
                created_on = datetime.datetime.strptime(
                    _header_value(line, CREATED_ON_PREFIX), CREATED_ON_FORMAT
                )
            if line.startswith(PANTHER_VERSION_PREFIX):
                pthr_version = _header_value(line, PANTHER_VERSION_PREFIX)
    return created_on, pthr_version


def make_gaf_dict(directory):
    """Index every archived PAINT GAF file by name, PANTHER version and date.

    *directory* is expected to contain one sub-directory per archive, each
    holding gzipped GAF files.  Entries of *directory* that are not
    directories, and files whose name is not one of the known PAINT GAF
    file names, are reported and ignored.

    Returns a dict shaped like::

        {
            gaf_fname: {
                "13.1": {header_date: full_path, ...},
                "14.1": {header_date: full_path, ...},
            },
            ...
        }

    where ``header_date`` is the ISO date (``YYYY-MM-DD``) of the file's
    ``!Created on`` header.  A PANTHER version other than the predefined
    ones is added for the file it is found in.

    Raises:
        ValueError: a recognised file lacks the creation-date or the
            PANTHER-version header line (or the date cannot be parsed).
    """
    iba_gaf_fnames = {
        "gene_association.paint_{}.gaf.gz".format(source): {
            version: {} for version in PANTHER_VERSIONS
        }
        for source in PAINT_SOURCES
    }

    # Sorted so that, when two archives hold the same file/version/date,
    # the winner does not depend on the order os.listdir happens to return.
    for d in sorted(os.listdir(directory)):
        archive_subdir = os.path.join(directory, d)
        if not os.path.isdir(archive_subdir):
            print("{} is not a directory - ignoring".format(archive_subdir))
            continue
        for f in sorted(os.listdir(archive_subdir)):
            gaf_file = os.path.join(archive_subdir, f)
            if f not in iba_gaf_fnames:
                print("{} not regular filename - {} - ignoring".format(f, gaf_file))
                continue

            created_on, pthr_version = _read_gaf_header(gaf_file)
            if created_on is None:
                raise ValueError(
                    "No '!Created on' header found in {}".format(gaf_file)
                )
            if not pthr_version:
                raise ValueError(
                    "No '!PANTHER version' header found in {}".format(gaf_file)
                )

            created_at = str(created_on.date())
            iba_gaf_fnames[f].setdefault(pthr_version, {})[created_at] = gaf_file

    return iba_gaf_fnames


def _parse_iso_date(text):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``."""
    return datetime.datetime.strptime(text, ISO_DATE_FORMAT).date()


def get_source_file(gaf_dict, fname, pthr_version, q_created_at):
    """Return the archived path to use for *fname* on date *q_created_at*.

    An archived file created exactly on *q_created_at* (``YYYY-MM-DD``) is
    preferred.  Otherwise the most recent file of the same name and PANTHER
    version created before that date is returned.  ``None`` is returned when
    no such file exists, including when *fname* or *pthr_version* is not
    present in *gaf_dict*.
    """
    dated_paths = gaf_dict.get(fname, {}).get(pthr_version, {})
    if q_created_at in dated_paths:
        return dated_paths[q_created_at]

    wanted = _parse_iso_date(q_created_at)  # query date aka desired date
    # Candidate dates aka "what we have", restricted to strictly earlier ones.
    earlier = [
        (have, full_path)
        for have, full_path in (
            (_parse_iso_date(s_created_at), path)
            for s_created_at, path in dated_paths.items()
        )
        if have < wanted
    ]
    if not earlier:
        return None
    return max(earlier, key=lambda pair: pair[0])[1]


def _path_parts(path):
    """Split *path* on '/' after normalising the platform separator to '/'."""
    return path.replace(os.sep, "/").split("/")


def new_subdir_pthr(new_subdir):
    """Return the PANTHER version of ``.../<version>/<date>/presubmission``."""
    return _path_parts(new_subdir)[-3]


def new_subdir_date(new_subdir):
    """Return the date of ``.../<version>/<date>/presubmission``."""
    return _path_parts(new_subdir)[-2]


def main():
    """Create one presubmission folder per (version, date) and fill it."""
    iba_gaf_fnames = make_gaf_dict(ARCHIVE_DIR)

    # Collect the distinct (date, version) pairs; date first so that the
    # natural tuple order is chronological (ISO dates sort as strings).
    snapshots = set()
    for versions in iba_gaf_fnames.values():
        for pthr_version, dated_paths in versions.items():
            for created_at in dated_paths:
                snapshots.add((created_at, pthr_version))

    for index, (created_at, pthr_version) in enumerate(sorted(snapshots)):
        new_subdir = os.path.join(
            NEW_DIR_ROOT, pthr_version, created_at, "presubmission"
        )
        try:
            os.makedirs(new_subdir)
        except FileExistsError:
            print("{} already exists".format(new_subdir))

        # Start copying files
        for fname in iba_gaf_fnames:
            source = get_source_file(iba_gaf_fnames, fname, pthr_version, created_at)
            if source:
                shutil.copy(source, os.path.join(new_subdir, fname))
                continue
            if index == 0:
                # Nothing older can exist for the very first folder.
                print("No source file found for {}/{} but this is the earliest folder so whadya gonna do?".format(new_subdir, fname))
                continue
            raise LookupError(
                "No source file found for {}/{}".format(new_subdir, fname)
            )


if __name__ == "__main__":
    main()