# general_merger.py
import argparse
import os

import pandas as pd
from tqdm import tqdm


class GeneralMerger(object):
    """Merge groups of comma-separated ``.txt`` files into one file per group.

    Files in ``input_folder`` are grouped by their *core name*: the part of
    the file name that precedes the first ``'---'``. A file name without
    ``'---'`` is its own core name (extension included). Every group is
    concatenated row-wise and written to
    ``<output_folder>/<core_name>_merged.txt``.
    """

    # Token separating the core name from the rest of the file name.
    _SEPARATOR = '---'
    # Only files with this extension are considered for merging.
    _EXTENSION = '.txt'

    def __init__(self, input_folder, output_folder):
        """Store the folder to read from and the folder to write to."""
        self.input_folder = input_folder
        self.output_folder = output_folder

    def _core_name(self, file_name):
        """Return the part of the file's base name before the first separator."""
        return os.path.basename(file_name).split(self._SEPARATOR)[0]

    def _txt_file_names(self):
        """Return the ``.txt`` entries of the input folder in sorted order.

        ``os.listdir`` makes no ordering promise, so the names are sorted to
        keep the merge order reproducible between runs and machines.
        """
        return sorted(
            name for name in os.listdir(self.input_folder)
            if name.endswith(self._EXTENSION)
        )

    def get_file_names(self):
        """Return the sorted list of distinct core names in the input folder."""
        return sorted({self._core_name(name) for name in self._txt_file_names()})

    def merge(self,
              file_core_name=None,  # this is the core name of the files to be merged
              ):
        """Concatenate every file of one group and save the result.

        The column names are taken from the header line of the input files;
        rows are appended in lexicographic order of the file names.

        Args:
            file_core_name: Core name selecting the files to merge.

        Returns:
            The merged ``pandas.DataFrame`` that was written to disk.

        Raises:
            ValueError: If ``file_core_name`` is not given.
            FileNotFoundError: If no ``.txt`` file in the input folder has
                the requested core name.
        """
        if file_core_name is None:
            raise ValueError('file_core_name must be provided')

        files = [
            os.path.join(self.input_folder, name)
            for name in self._txt_file_names()
            if self._core_name(name) == file_core_name
        ]
        if not files:
            raise FileNotFoundError(
                "no '{}' files with core name '{}' found in '{}'".format(
                    self._EXTENSION, file_core_name, self.input_folder
                )
            )

        # header=0 already turns the first line of each file into the column
        # names, so the concatenated frame keeps the real header. The earlier
        # code replaced the columns with the first *data row* (``.iloc[0]``),
        # which wrote data values where the header should be.
        frames = [pd.read_csv(path, sep=',', header=0) for path in files]
        merged_df = pd.concat(frames, ignore_index=True)

        # Make merge() usable on its own, not only through merge_all().
        os.makedirs(self.output_folder, exist_ok=True)
        path_to_save = os.path.join(self.output_folder, file_core_name + '_merged.txt')
        merged_df.to_csv(path_to_save, index=False, sep=',')

        return merged_df

    def merge_all(self):
        """Merge every group found in the input folder, showing a progress bar."""
        os.makedirs(self.output_folder, exist_ok=True)

        for core_name in tqdm(self.get_file_names()):
            self.merge(file_core_name=core_name)

# Example usage (kept for reference; the script entry point below does the same):
# input_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output_text'
# output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/merged_output'
# merger = GeneralMerger(input_folder, output_folder)
# # merger.merge()
# files = merger.get_file_names()
# merger.merge_all()

# print(files)

if __name__ == '__main__':
    # Command-line entry point: both folders are optional flags that fall
    # back to the project's default locations.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--input_folder',
        type=str,
        default='/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output_text',
    )
    cli.add_argument(
        '--output_folder',
        type=str,
        default='/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/merged_output',
    )
    options = cli.parse_args()

    # Build the merger from the parsed folders and process every file group.
    GeneralMerger(options.input_folder, options.output_folder).merge_all()