From f5ad3556a057d402a219e723cde51185d09f3b45 Mon Sep 17 00:00:00 2001
From: Maciej Wielgosz <maciej.wielgosz@nibio.no>
Date: Thu, 8 Jun 2023 09:58:18 +0200
Subject: [PATCH] splitting tree files for the model - non-parallel version done

---
 .gitignore                                    |   6 +-
 .../pyg_implementaion_main_my_data_loader.py  |  13 +-
 data_split_merge/general_merger.py            |  80 +++++++++
 data_split_merge/general_splitter.py          | 157 ++++++++++++++++++
 data_split_merge/merge_files.py               |  17 ++
 requirements.txt                              |   2 +-
 test_split/general_merger.py                  |  37 -----
 test_split/general_splitter.py                | 110 ------------
 8 files changed, 270 insertions(+), 152 deletions(-)
 create mode 100644 data_split_merge/general_merger.py
 create mode 100644 data_split_merge/general_splitter.py
 create mode 100644 data_split_merge/merge_files.py
 delete mode 100644 test_split/general_merger.py
 delete mode 100644 test_split/general_splitter.py

diff --git a/.gitignore b/.gitignore
index 0f2e01b..54a0327 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,6 +118,7 @@ pip_egg_info/
 *.las
 *.png
 *.h5
+*.zip
 
 # Deep learning
 model.h5
@@ -132,5 +133,8 @@ ShapeNet/
 maciek_data
 nibio_data
 nibio_data_no_commas
-
+maciek_data
+nibio_data
+nibio_data_las_single_file
+nibio_data_las/
 
diff --git a/PyG_implementation/pyg_implementaion_main_my_data_loader.py b/PyG_implementation/pyg_implementaion_main_my_data_loader.py
index 564fdcc..f14a7ff 100644
--- a/PyG_implementation/pyg_implementaion_main_my_data_loader.py
+++ b/PyG_implementation/pyg_implementaion_main_my_data_loader.py
@@ -51,7 +51,7 @@
 config.initial_lr = 1e-3
 config.lr_scheduler_step_size = 5
 config.gamma = 0.8
-config.epochs = 1
+config.epochs = 40
 
 
 transform = T.Compose([
@@ -62,10 +62,17 @@ transform = T.Compose([
 ])
 pre_transform = T.NormalizeScale()
 
-dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/maciek_data/plane_maciek"
+dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/nibio_data_txt_single_file"
+# dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/maciek_data/plane_maciek"
 # dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/nibio_data_no_commas"
 
-train_val_dataset = MyData(dataset_path, split='trainval', transform=transform, pre_transform=pre_transform)
+train_val_dataset = MyData(
+    dataset_path,
+    label_location=-2,
+    split='trainval',
+    transform=transform,
+    pre_transform=pre_transform
+    )
 
 segmentation_class_frequency = {}
 
diff --git a/data_split_merge/general_merger.py b/data_split_merge/general_merger.py
new file mode 100644
index 0000000..742dff4
--- /dev/null
+++ b/data_split_merge/general_merger.py
@@ -0,0 +1,80 @@
+import argparse
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+
+class GeneralMerger(object):
+    def __init__(self, input_folder, output_folder):
+        self.input_folder = input_folder
+        self.output_folder = output_folder
+
+    def get_file_names(self):
+        # collect the distinct core names (the part before '---') of the chunk files
+        files = [os.path.join(self.input_folder, f) for f in os.listdir(self.input_folder) if f.endswith('.txt')]
+        files_split = [os.path.basename(f).split('---')[0] for f in files]
+        files_set = list(set(files_split))
+        return files_set
+
+    def merge(self,
+              file_core_name=None,  # the core name of the files to be merged
+              ):
+        # gather all chunk files that share the given core name
+        files = [os.path.join(self.input_folder, f)
+                 for f in os.listdir(self.input_folder) if f.endswith('.txt') and os.path.basename(f).split('---')[0] == file_core_name]
+
+        # create an empty list to hold dataframes
+        dfs = []
+
+        # get the column names from the first file
+        header = pd.read_csv(files[0], sep=',', header=0).columns
+
+        # read each file and append it to the list
+        for file in files:
+            dfs.append(pd.read_csv(file, sep=',', header=0))
+
+        # concatenate all dataframes into a single one
+        merged_df = pd.concat(dfs, ignore_index=True)
+
+        # re-apply the header
+        merged_df.columns = header
+
+        # write the merged dataframe to a csv file
+        path_to_save = os.path.join(self.output_folder, file_core_name + '_merged.txt')
+        merged_df.to_csv(path_to_save, index=False, sep=',')
+
+        return merged_df
+
+    def merge_all(self):
+        os.makedirs(self.output_folder, exist_ok=True)
+
+        files = self.get_file_names()
+        for file in tqdm(files):
+            self.merge(file_core_name=file)
+
+
+# test the code
+# input_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output_text'
+# output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/merged_output'
+# merger = GeneralMerger(input_folder, output_folder)
+# # merger.merge()
+# files = merger.get_file_names()
+# merger.merge_all()
+
+# print(files)
+
+if __name__ == '__main__':
+    # parse the arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_folder', type=str, default='/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output_text')
+    parser.add_argument('--output_folder', type=str, default='/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/merged_output')
+    args = parser.parse_args()
+
+    # create the merger object
+    merger = GeneralMerger(args.input_folder, args.output_folder)
+
+    # merge all the files
+    merger.merge_all()
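
Note: the merger reverses the splitter's naming scheme — chunk files are grouped by the core name in front of the '---' marker and concatenated back into one text file per source cloud. A minimal usage sketch (assuming the repo root is on PYTHONPATH; folder and file names are hypothetical):

    from data_split_merge.general_merger import GeneralMerger

    # given chunk files such as chunks_txt/plot_1---0.txt, chunks_txt/plot_1---1.txt, ...
    merger = GeneralMerger('chunks_txt/', 'merged_txt/')
    merger.merge_all()  # writes merged_txt/plot_1_merged.txt, one file per core name
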
diff --git a/data_split_merge/general_splitter.py b/data_split_merge/general_splitter.py
new file mode 100644
index 0000000..9e01cab
--- /dev/null
+++ b/data_split_merge/general_splitter.py
@@ -0,0 +1,157 @@
+import json
+import os
+import argparse
+import laspy
+import numpy as np
+from pathlib import Path
+from sklearn.neighbors import KDTree
+from tqdm import tqdm
+from joblib import Parallel, delayed  # used by the commented-out parallel variant below
+import subprocess
+from distutils.dir_util import copy_tree
+
+
+SPLIT_TEMPLATE = json.dumps({
+    "pipeline": [
+        "input.las",
+        {
+            "type": "filters.chipper",
+            "capacity": "5000"
+        },
+        {
+            "type": "writers.las",
+            "filename": "output_#.las"
+        }
+    ]
+})
+
+LIST_OF_LAS_FIELDS = ['label', 'treeID']
+
+
+def split(file_path, output_folder, capacity=5000):
+    # create the output folder if it does not exist
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    # generate a unique pipeline filename so that concurrent runs do not clash
+    unique_filename = 'split_pipeline_' + str(os.getpid()) + '.json'
+
+    with open(unique_filename, 'w') as f:
+        f.write(
+            SPLIT_TEMPLATE
+            .replace('input.las', file_path)
+            .replace('5000', str(capacity))
+            .replace('output_#.las',
+                     os.path.join(output_folder,
+                                  Path(file_path).stem + '---#.las'))
+        )
+
+    # run the pipeline; capture output so the error message below has something to show
+    try:
+        subprocess.run(['pdal', 'pipeline', unique_filename], check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f'Command failed with error {e.returncode}. Output was:\n{e.output}')
+
+    # remove the pipeline file
+    os.remove(unique_filename)
+
+
+def transfer_extra_fields(file_path, output_folder):
+    # read the source file using laspy
+    input_file = os.path.join(file_path)
+    las = laspy.read(input_file)
+
+    # get the point cloud and put it into a KDTree
+    point_cloud = np.vstack((las.x, las.y, las.z)).transpose()
+
+    tree = KDTree(point_cloud)
+
+    # get the list of output files
+    output_files = [os.path.join(output_folder, f) for f in os.listdir(output_folder) if f.endswith('.las')]
+    for output_file in output_files:
+        # read the output file
+        target = laspy.read(output_file)
+        target_xyz = target.xyz
+        # find the nearest source neighbor for each point in the target las file
+        ind = tree.query(target_xyz, k=1, return_distance=False)
+
+        selected_points_x = las.x[ind.ravel()]
+        selected_points_y = las.y[ind.ravel()]
+        selected_points_z = las.z[ind.ravel()]
+
+        new_header = laspy.LasHeader(point_format=las.point_format.id, version=las.header.version)
+
+        # add the extra dimensions to the new las file
+        for item in LIST_OF_LAS_FIELDS:
+            new_header.add_extra_dim(laspy.ExtraBytesParams(name=item, type=np.int32))
+
+        new_las = laspy.LasData(new_header)
+
+        # copy x, y and z of the matched source points into the new las file
+        new_las.x = selected_points_x
+        new_las.y = selected_points_y
+        new_las.z = selected_points_z
+
+        # copy the contents of the extra dimensions from the source las file to the new las file
+        for item in new_header.point_format.dimension_names:
+            if item in LIST_OF_LAS_FIELDS:
+                new_las[item] = las[item][ind.ravel()]
+
+        # write the new las file
+        new_las.write(output_file)
+
+
+class GeneralSplitter(object):
+    def __init__(self, input_folder, output_folder, capacity=5000):
+        self.input_folder = input_folder
+        self.output_folder = output_folder
+        self.capacity = capacity
+
+    def process_file(self, file, capacity, output_folder):
+        # create the temporary output folder
+        temp_output_folder = 'temp_output_folder'
+        if not os.path.exists(temp_output_folder):
+            os.makedirs(temp_output_folder)
+
+        # split and transfer the extra fields
+        split(file, temp_output_folder, capacity)
+        transfer_extra_fields(file, temp_output_folder)
+
+        # copy all the files from the temporary output folder to the output folder
+        copy_tree(temp_output_folder, output_folder)
+
+        # remove the temporary output folder
+        subprocess.run(['rm', '-r', temp_output_folder], check=True)
+
+    def split_and_transfer_in_folder(self):
+        # get the list of files in the input folder
+        files = [os.path.join(self.input_folder, f) for f in os.listdir(self.input_folder) if f.endswith('.las')]
+        for file in tqdm(files):
+            self.process_file(file, self.capacity, self.output_folder)
+
+        # use parallel processing
+        # Parallel(n_jobs=4)(delayed(self.process_file)(file, self.capacity, self.output_folder) for file in tqdm(files))
+
+
+# filepath = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/input_folder'
+# output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output'
+# splitter = GeneralSplitter(filepath, output_folder)
+# splitter.split_and_transfer_in_folder()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Split LAS files into multiple smaller LAS files')
+    parser.add_argument('--input_folder', type=str, help='Input folder with LAS files')
+    parser.add_argument('--output_folder', type=str, help='Output folder')
+    parser.add_argument('--capacity', type=int, default=5000, help='Capacity of each output LAS file')
+    args = parser.parse_args()
+
+    splitter = GeneralSplitter(args.input_folder, args.output_folder, args.capacity)
+
+    splitter.split_and_transfer_in_folder()
\ No newline at end of file
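
Note: after the three replace() calls in split(), the pipeline JSON handed to PDAL for a hypothetical input plots_las/plot_1.las would look like this (filters.chipper partitions the cloud into spatially contiguous chips of at most 'capacity' points, and writers.las expands the '#' into a running chunk index):

    {
        "pipeline": [
            "plots_las/plot_1.las",
            {"type": "filters.chipper", "capacity": "5000"},
            {"type": "writers.las", "filename": "output/plot_1---#.las"}
        ]
    }

A typical invocation of the new splitter (folder names hypothetical):

    python data_split_merge/general_splitter.py \
        --input_folder plots_las/ \
        --output_folder plots_las_chunks/ \
        --capacity 5000

The nearest-neighbor pass in transfer_extra_fields appears to exist because writers.las does not write extra dimensions unless explicitly configured, so the chunks come out with only the standard fields; 'label' and 'treeID' are re-attached by looking each chunk point up in a KDTree built over the source cloud.
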
diff --git a/data_split_merge/merge_files.py b/data_split_merge/merge_files.py
new file mode 100644
index 0000000..bf47b61
--- /dev/null
+++ b/data_split_merge/merge_files.py
@@ -0,0 +1,17 @@
+import json
+import glob
+
+# Get a list of all .las files in the current directory
+las_files = glob.glob('output_*.las')
+
+# Create the pipeline
+pipeline = {
+    "pipeline": las_files + [{
+        "type": "writers.las",
+        "filename": "merged_output.las"
+    }]
+}
+
+# Write the pipeline to a file
+with open('merge_pipeline.json', 'w') as f:
+    json.dump(pipeline, f)
diff --git a/requirements.txt b/requirements.txt
index ec7c920..a5c43cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,7 @@ backcall==0.2.0
 Cerberus==1.3.4
 certifi==2022.12.7
 cffi==1.15.1
-chamfer-distance==0.1
+# chamfer-distance==0.1
 charset-normalizer==3.0.1
 circuitbreaker==1.4.0
 click==8.1.3
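
Note: merge_files.py only writes merge_pipeline.json; nothing in this patch executes it. A minimal way to actually run the merge, mirroring how general_splitter.py shells out to PDAL:

    import subprocess

    # assumes merge_files.py has been run in the same directory first
    subprocess.run(['pdal', 'pipeline', 'merge_pipeline.json'], check=True)
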
diff --git a/test_split/general_merger.py b/test_split/general_merger.py
deleted file mode 100644
index 32ba63e..0000000
--- a/test_split/general_merger.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import os
-import laspy
-import pandas as pd
-
-class GeneralMerger(object):
-    def __init__(self, folder_path, file_name):
-        self.folder_path = folder_path
-        self.file_name = file_name
-
-    def merge(self):
-        # read files in the folder_path
-        files = [os.path.join(self.folder_path, f) for f in os.listdir(self.folder_path) if f.endswith('.txt')]
-
-        # create an empty list to hold dataframes
-        dfs = []
-
-        # get the header from the first file
-        header = pd.read_csv(files[0], sep=',', header=None).iloc[0]
-
-        # read each file and append to the list
-        for file in files:
-            dfs.append(pd.read_csv(file, sep=',', header=None))
-
-        # concatenate all dataframes into a single one
-        merged_df = pd.concat(dfs, ignore_index=True)
-
-        # add the header
-        merged_df.columns = header
-
-        # write the merged dataframe to a csv file
-        merged_df.to_csv(self.file_name, index=False)
-
-# test the code
-folder_path = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/output_text'
-file_name = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/merged_output.txt'
-merger = GeneralMerger(folder_path, file_name)
-merger.merge()
diff --git a/test_split/general_splitter.py b/test_split/general_splitter.py
deleted file mode 100644
index 2ae83aa..0000000
--- a/test_split/general_splitter.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import json
-import os
-import argparse
-import laspy
-import numpy as np
-from sklearn.neighbors import KDTree
-
-SPLIT_TEMPLATE = json.dumps({
-    "pipeline": [
-        "input.las",
-        {
-            "type": "filters.chipper",
-            "capacity": "5000"
-        },
-        {
-            "type": "writers.las",
-            "filename": "output_#.las"
-        }
-    ]
-})
-
-LIST_OF_LAS_FIELDS = ['label', 'treeID']
-
-class GeneralSplitter(object):
-    def __init__(self, input_file, output_folder, capacity=5000):
-        self.input_file = input_file
-        self.output_folder = output_folder
-        self.capacity = capacity
-
-    def split(self):
-        # create the output folder if it does not exist
-        if not os.path.exists(self.output_folder):
-            os.makedirs(self.output_folder)
-
-        with open('split_pipeline.json', 'w') as f:
-            f.write(
-                SPLIT_TEMPLATE.replace('input.las',
-                    self.input_file).replace('5000', str(self.capacity)).replace('output_#.las',
-                    self.output_folder + '/output_#.las'))
-        # run the pipeline
-        os.system('pdal pipeline split_pipeline.json')
-        # remove the pipeline file
-        os.remove('split_pipeline.json')
-
-    def transfer_extra_fields(self):
-        # get list of extra fields in the input file using laspy
-        input_file = os.path.join(self.input_file)
-        las = laspy.read(input_file)
-
-        # get pointcloud and put it into KDTree
-        point_cloud = np.vstack((las.x, las.y, las.z)).transpose()
-
-        tree = KDTree(point_cloud)
-
-        # get the list of output files
-        output_files = [os.path.join(self.output_folder, f) for f in os.listdir(self.output_folder) if f.endswith('.las')]
-        for output_file in output_files:
-            # read the output file
-            target = laspy.read(output_file)
-            target_xyz = target.xyz
-            # find the nearest neighbor for each point in target las file
-            ind = tree.query(target_xyz, k=1, return_distance=False)
-
-            selected_points_x = las.x[ind.ravel()]
-            selected_points_y = las.y[ind.ravel()]
-            selected_points_z = las.z[ind.ravel()]
-            selected_points = np.vstack([selected_points_x, selected_points_y, selected_points_z]).T
-
-            new_header = laspy.LasHeader(point_format=las.point_format.id, version=las.header.version)
-
-            # add extra dimensions to new las file
-            for item in LIST_OF_LAS_FIELDS:
-                new_header.add_extra_dim(laspy.ExtraBytesParams(name=item, type=np.int32))
-
-            new_las = laspy.LasData(new_header)
-
-            # copy x, y, z, gt_label and target_label from target las file to the new las file
-            new_las.x = selected_points_x
-            new_las.y = selected_points_y
-            new_las.z = selected_points_z
-
-            # copy contents of extra dimensions from target las file to the new las file
-            for item in new_header.point_format.dimension_names:
-                if item in LIST_OF_LAS_FIELDS:
-                    new_las[item] = las[item][ind.ravel()]
-
-            # write the new las file
-            new_las.write(output_file)
-
-
-filepath = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/input.las'
-output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/output'
-splitter = GeneralSplitter(filepath, output_folder)
-splitter.split()
-splitter.transfer_extra_fields()
-
-
-# if __name__ == '__main__':
-#     parser = argparse.ArgumentParser(description='Split a LAS file into multiple LAS files')
-#     parser.add_argument('--input_file', type=str, help='Input LAS file')
-#     parser.add_argument('--output_folder', type=str, help='Output folder')
-#     parser.add_argument('--capacity', type=int, default=5000, help='Capacity of each output LAS file')
-#     args = parser.parse_args()
-
-#     splitter = GeneralSplitter(args.input_file, args.output_folder, args.capacity)
-#     # splitter.split()
-#     splitter.transfer_extra_fields()
--
GitLab
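
Note: taken together, the intended round trip looks roughly like this — a sketch assuming the .las chunks are converted to .txt somewhere between the two steps, which this patch does not do, and with hypothetical folder names:

    from data_split_merge.general_splitter import GeneralSplitter
    from data_split_merge.general_merger import GeneralMerger

    # 1) split each large LAS plot into ~5000-point chunks, re-attaching 'label' and 'treeID'
    GeneralSplitter('plots_las/', 'chunks_las/', capacity=5000).split_and_transfer_in_folder()

    # 2) per-chunk processing / conversion to .txt happens outside this patch

    # 3) merge the per-chunk text files back into one file per plot
    GeneralMerger('chunks_txt/', 'merged_txt/').merge_all()
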