From f5ad3556a057d402a219e723cde51185d09f3b45 Mon Sep 17 00:00:00 2001
From: Maciej Wielgosz <maciej.wielgosz@nibio.no>
Date: Thu, 8 Jun 2023 09:58:18 +0200
Subject: [PATCH] splitting tree files for the model - non-parallel version done

---
 .gitignore                                    |   6 +-
 .../pyg_implementaion_main_my_data_loader.py  |  13 +-
 data_split_merge/general_merger.py            |  80 +++++++++
 data_split_merge/general_splitter.py          | 157 ++++++++++++++++++
 data_split_merge/merge_files.py               |  17 ++
 requirements.txt                              |   2 +-
 test_split/general_merger.py                  |  37 -----
 test_split/general_splitter.py                | 110 ------------
 8 files changed, 270 insertions(+), 152 deletions(-)
 create mode 100644 data_split_merge/general_merger.py
 create mode 100644 data_split_merge/general_splitter.py
 create mode 100644 data_split_merge/merge_files.py
 delete mode 100644 test_split/general_merger.py
 delete mode 100644 test_split/general_splitter.py

diff --git a/.gitignore b/.gitignore
index 0f2e01b..54a0327 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,6 +118,7 @@ pip_egg_info/
 *.las
 *.png
 *.h5
+*.zip
 
 # Deep learning
 model.h5
@@ -132,5 +133,8 @@ ShapeNet/
 maciek_data
 nibio_data
 nibio_data_no_commas
-
+maciek_data
+nibio_data
+nibio_data_las_single_file
+nibio_data_las/
 ```
diff --git a/PyG_implementation/pyg_implementaion_main_my_data_loader.py b/PyG_implementation/pyg_implementaion_main_my_data_loader.py
index 564fdcc..f14a7ff 100644
--- a/PyG_implementation/pyg_implementaion_main_my_data_loader.py
+++ b/PyG_implementation/pyg_implementaion_main_my_data_loader.py
@@ -51,7 +51,7 @@ config.initial_lr = 1e-3
 config.lr_scheduler_step_size = 5
 config.gamma = 0.8
 
-config.epochs = 1
+config.epochs = 40
 
 
 transform = T.Compose([
@@ -62,10 +62,17 @@ transform = T.Compose([
 ])
 pre_transform = T.NormalizeScale()
 
-dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/maciek_data/plane_maciek"
+dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/nibio_data_txt_single_file"
+# dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/maciek_data/plane_maciek"
 # dataset_path = "/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/nibio_data_no_commas"
 
-train_val_dataset = MyData(dataset_path, split='trainval', transform=transform, pre_transform=pre_transform)
+train_val_dataset = MyData(
+    dataset_path, 
+    label_location=-2,
+    split='trainval', 
+    transform=transform, 
+    pre_transform=pre_transform
+    )
 
 
 segmentation_class_frequency = {}
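
The `label_location=-2` argument passed to the custom `MyData` loader above presumably selects which column of each point record holds the semantic label. `MyData` itself is not part of this patch, so the following is only a minimal, hypothetical sketch of how a negative column index like this is typically applied when parsing one line of a point-cloud text file; the column layout shown is an assumption, not taken from the repository.

```python
# Hypothetical sketch: applying label_location=-2 to one text line.
# Assumed layout (not from this patch): x, y, z, <semantic label>, <treeID>
line = "1.20,3.40,0.75,4,17"
values = line.strip().split(',')

label_location = -2                          # same value as in the MyData call
xyz = [float(v) for v in values[:3]]         # coordinates
label = int(float(values[label_location]))   # second-to-last column -> 4
```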
diff --git a/data_split_merge/general_merger.py b/data_split_merge/general_merger.py
new file mode 100644
index 0000000..742dff4
--- /dev/null
+++ b/data_split_merge/general_merger.py
@@ -0,0 +1,80 @@
+import argparse
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+
+class GeneralMerger(object):
+    def __init__(self, input_folder, output_folder):
+        self.input_folder = input_folder
+        self.output_folder = output_folder
+
+    def get_file_names(self):
+        # read files in the folder_path
+        files = [os.path.join(self.input_folder, f) for f in os.listdir(self.input_folder) if f.endswith('.txt')]
+        files_split = [os.path.basename(f).split('---')[0] for f in files]
+        files_set = list(set(files_split))
+        return files_set
+
+    def merge(self, 
+              file_core_name=None, # this is the core name of the files to be merged
+              ):
+        # read files in the folder_path
+        files = [os.path.join(self.input_folder, f) 
+                 for f in os.listdir(self.input_folder) if f.endswith('.txt') and os.path.basename(f).split('---')[0] == file_core_name]
+
+        # create an empty list to hold dataframes
+        dfs = []
+
+        # get the column names from the first file
+        header = pd.read_csv(files[0], sep=',', header=0).columns
+
+        # read each file and append to the list
+        for file in files:
+            dfs.append(pd.read_csv(file, sep=',', header=0))
+
+        # concatenate all dataframes into a single one
+        merged_df = pd.concat(dfs, ignore_index=True)
+
+        # add the header
+        merged_df.columns = header
+
+        # write the merged dataframe to a csv file
+        path_to_save = os.path.join(self.output_folder, file_core_name + '_merged.txt')
+        merged_df.to_csv(path_to_save, index=False, sep=',')
+
+        return merged_df
+    
+    def merge_all(self):
+        os.makedirs(self.output_folder, exist_ok=True)
+
+        files = self.get_file_names()
+        for file in tqdm(files):
+            self.merge(file_core_name=file)
+
+# test the code
+# input_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output_text'
+# output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/merged_output'
+# merger = GeneralMerger(input_folder, output_folder)
+# # merger.merge()
+# files = merger.get_file_names()
+# merger.merge_all()
+
+# print(files)
+
+if __name__ == '__main__':
+    # parse the arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_folder', type=str, default='/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output_text')
+    parser.add_argument('--output_folder', type=str, default='/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/merged_output')
+    args = parser.parse_args()
+
+    # create the merger object
+    merger = GeneralMerger(args.input_folder, args.output_folder)
+
+    # merge all the files
+    merger.merge_all()
+
+
+
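
For reference, the merger added above can be driven either through its argparse entry point or directly from Python, as the commented-out test code hints. A minimal usage sketch, under the assumptions that the package is importable from the repository root and that the input folder contains .txt chunks named `<core_name>---<part>.txt` (the naming the splitter produces); the paths are placeholders.

```python
# Minimal usage sketch for GeneralMerger (paths are placeholders).
from data_split_merge.general_merger import GeneralMerger

merger = GeneralMerger(
    input_folder='path/to/split_txt_chunks',   # files named like plot1---3.txt
    output_folder='path/to/merged_output',
)
print(merger.get_file_names())  # core names detected in the input folder
merger.merge_all()              # writes <core_name>_merged.txt for each core name
```

The command-line equivalent is `python data_split_merge/general_merger.py --input_folder ... --output_folder ...`.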
diff --git a/data_split_merge/general_splitter.py b/data_split_merge/general_splitter.py
new file mode 100644
index 0000000..9e01cab
--- /dev/null
+++ b/data_split_merge/general_splitter.py
@@ -0,0 +1,157 @@
+import json
+import os
+import argparse
+import laspy
+import numpy as np
+from pathlib import Path
+from sklearn.neighbors import KDTree
+from tqdm import tqdm
+from joblib import Parallel, delayed
+import subprocess
+from distutils.dir_util import copy_tree
+import pdal  # not referenced directly below; PDAL is invoked via its CLI through subprocess
+
+
+
+SPLIT_TEMPLATE = json.dumps({
+    "pipeline": [
+    "input.las",
+    {
+      "type": "filters.chipper",
+      "capacity": "5000"
+    },
+    {
+      "type": "writers.las",
+      "filename": "output_#.las"
+    }
+  ]
+})
+
+LIST_OF_LAS_FIELDS = ['label', 'treeID']
+
+
+def split(file_path, output_folder, capacity=5000):
+    # create the output folder if it does not exist
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    # Generate unique filename
+    unique_filename = 'split_pipeline_' + str(os.getpid()) + '.json'
+
+    with open(unique_filename, 'w') as f:
+        f.write(
+            SPLIT_TEMPLATE
+            .replace('input.las', file_path)
+            .replace('5000', str(capacity))
+            .replace('output_#.las', 
+                        os.path.join(output_folder, 
+                                    Path(file_path).stem + '---#.las'))
+            )
+
+    # run the pipeline
+    try:
+        subprocess.run(['pdal', 'pipeline', unique_filename], check=True)
+    except subprocess.CalledProcessError as e:
+        print(f'Command failed with error {e.returncode}. Output was:\n{e.output}')
+
+    # remove the pipeline file
+    os.remove(unique_filename)
+
+
+def transfer_extra_fields(file_path, output_folder):
+    # get list of extra fields in the input file using laspy
+    input_file = os.path.join(file_path)
+    las = laspy.read(input_file)
+    
+    # get pointcloud and put it into KDTree
+    point_cloud = np.vstack((las.x, las.y, las.z)).transpose()
+
+    tree = KDTree(point_cloud)
+
+    # get the list of output files
+    output_files = [os.path.join(output_folder, f) for f in os.listdir(output_folder) if f.endswith('.las')]
+    for output_file in output_files:
+        # read the output file
+        target = laspy.read(output_file)
+        target_xyz = target.xyz
+        # find the nearest neighbor for each point in target las file
+        ind = tree.query(target_xyz, k=1, return_distance=False)
+
+        selected_points_x = las.x[ind.ravel()]
+        selected_points_y = las.y[ind.ravel()]
+        selected_points_z = las.z[ind.ravel()]
+
+        new_header = laspy.LasHeader(point_format=las.point_format.id, version=las.header.version)
+
+        # add extra dimensions to new las file
+        for item in LIST_OF_LAS_FIELDS:
+            new_header.add_extra_dim(laspy.ExtraBytesParams(name=item, type=np.int32))
+
+        new_las = laspy.LasData(new_header)
+        
+        # copy x, y, z, gt_label and target_label from target las file to the new las file
+        new_las.x = selected_points_x
+        new_las.y = selected_points_y
+        new_las.z = selected_points_z
+
+        # copy contents of extra dimensions from target las file to the new las file
+        for item in new_header.point_format.dimension_names:
+            if item in LIST_OF_LAS_FIELDS:
+                new_las[item] = las[item][ind.ravel()]
+
+        # write the new las file
+        new_las.write(output_file)
+
+
+class GeneralSplitter(object):
+    def __init__(self, input_folder, output_folder, capacity=5000):
+        self.input_folder = input_folder
+        self.output_folder = output_folder
+        self.capacity = capacity
+
+
+    def process_file(self, file, capacity, output_folder):
+        # create the temporary output folder
+        temp_output_folder = 'temp_output_folder'
+        if not os.path.exists(temp_output_folder):
+            os.makedirs(temp_output_folder)
+        
+        # split and transfer extra fields
+        split(file, temp_output_folder, capacity)
+        transfer_extra_fields(file, temp_output_folder)
+        
+        # copy all the files from the temporary output folder to the output folder using distutils' copy_tree
+        copy_tree(temp_output_folder, output_folder)
+    
+        # remove the temporary output folder
+        subprocess.run(['rm', '-r', temp_output_folder], check=True)
+
+
+    def split_and_transfer_in_folder(self):
+        # get list of files in the input folder
+        files = [os.path.join(self.input_folder, f) for f in os.listdir(self.input_folder) if f.endswith('.las')]
+        for file in tqdm(files):
+            self.process_file(file, self.capacity, self.output_folder)
+
+        # use parallel processing
+        # Parallel(n_jobs=4)(delayed(self.process_file)(file, self.capacity, self.output_folder) for file in tqdm(files))
+
+
+
+      
+# filepath = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/input_folder'
+# output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/data_split_merge/output'
+# splitter = GeneralSplitter(filepath, output_folder)
+# splitter.split_and_transfer_in_folder()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Split a LAS file into multiple LAS files')
+    parser.add_argument('--input_folder', type=str, help='Input LAS file')
+    parser.add_argument('--output_folder', type=str, help='Output folder')
+    parser.add_argument('--capacity', type=int, default=5000, help='Capacity of each output LAS file')
+    args = parser.parse_args()
+
+    splitter = GeneralSplitter(args.input_folder, args.output_folder, args.capacity)
+
+    splitter.split_and_transfer_in_folder()
\ No newline at end of file
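
The splitter added above follows the same pattern; here is a minimal usage sketch, assuming the PDAL command-line tool is on PATH (the `split` function shells out to `pdal pipeline`) and that the package is importable from the repository root. Paths are placeholders.

```python
# Minimal usage sketch for GeneralSplitter (paths are placeholders).
from data_split_merge.general_splitter import GeneralSplitter

splitter = GeneralSplitter(
    input_folder='path/to/las_plots',      # folder with one or more .las files
    output_folder='path/to/split_output',  # receives <stem>---<n>.las chunks
    capacity=5000,                         # max points per output chunk
)
splitter.split_and_transfer_in_folder()
```

The same can be done from the shell with `python data_split_merge/general_splitter.py --input_folder ... --output_folder ... --capacity 5000`.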
diff --git a/data_split_merge/merge_files.py b/data_split_merge/merge_files.py
new file mode 100644
index 0000000..bf47b61
--- /dev/null
+++ b/data_split_merge/merge_files.py
@@ -0,0 +1,17 @@
+import json
+import glob
+
+# Get a list of all .las files in the current directory
+las_files = glob.glob('output_*.las')
+
+# Create the pipeline
+pipeline = {
+    "pipeline": las_files + [{
+        "type": "writers.las",
+        "filename": "merged_output.las"
+    }]
+}
+
+# Write the pipeline to a file
+with open('merge_pipeline.json', 'w') as f:
+    json.dump(pipeline, f)
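
Note that merge_files.py only writes `merge_pipeline.json`; executing the merge is a separate step. A sketch of that follow-up step, mirroring the subprocess call already used in general_splitter.py and assuming the PDAL CLI is available on PATH:

```python
# Run the generated PDAL merge pipeline (assumes `pdal` is on PATH).
import subprocess

try:
    subprocess.run(['pdal', 'pipeline', 'merge_pipeline.json'], check=True)
except subprocess.CalledProcessError as e:
    print(f'PDAL merge failed with exit code {e.returncode}')
```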
diff --git a/requirements.txt b/requirements.txt
index ec7c920..a5c43cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,7 @@ backcall==0.2.0
 Cerberus==1.3.4
 certifi==2022.12.7
 cffi==1.15.1
-chamfer-distance==0.1
+# chamfer-distance==0.1
 charset-normalizer==3.0.1
 circuitbreaker==1.4.0
 click==8.1.3
diff --git a/test_split/general_merger.py b/test_split/general_merger.py
deleted file mode 100644
index 32ba63e..0000000
--- a/test_split/general_merger.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import os
-import laspy
-import pandas as pd
-
-class GeneralMerger(object):
-    def __init__(self, folder_path, file_name):
-        self.folder_path = folder_path
-        self.file_name = file_name
-
-    def merge(self):
-        # read files in the folder_path
-        files = [os.path.join(self.folder_path, f) for f in os.listdir(self.folder_path) if f.endswith('.txt')]
-
-        # create an empty list to hold dataframes
-        dfs = []
-
-        # get the header from the first file
-        header = pd.read_csv(files[0], sep=',', header=None).iloc[0]
-
-        # read each file and append to the list
-        for file in files:
-            dfs.append(pd.read_csv(file, sep=',', header=None))
-
-        # concatenate all dataframes into a single one
-        merged_df = pd.concat(dfs, ignore_index=True)
-
-        # add the header
-        merged_df.columns = header
-
-        # write the merged dataframe to a csv file
-        merged_df.to_csv(self.file_name, index=False)
-
-# test the code
-folder_path = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/output_text'
-file_name = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/merged_output.txt'
-merger = GeneralMerger(folder_path, file_name)
-merger.merge()
diff --git a/test_split/general_splitter.py b/test_split/general_splitter.py
deleted file mode 100644
index 2ae83aa..0000000
--- a/test_split/general_splitter.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import json
-import os
-import argparse
-import laspy
-import numpy as np
-from sklearn.neighbors import KDTree
-
-SPLIT_TEMPLATE = json.dumps({
-    "pipeline": [
-    "input.las",
-    {
-      "type": "filters.chipper",
-      "capacity": "5000"
-    },
-    {
-      "type": "writers.las",
-      "filename": "output_#.las"
-    }
-  ]
-})
-
-LIST_OF_LAS_FIELDS = ['label', 'treeID']
-
-class GeneralSplitter(object):
-    def __init__(self, input_file, output_folder, capacity=5000):
-        self.input_file = input_file
-        self.output_folder = output_folder
-        self.capacity = capacity
-
-    def split(self):
-        # create the output folder if it does not exist
-        if not os.path.exists(self.output_folder):
-            os.makedirs(self.output_folder)
-
-        with open('split_pipeline.json', 'w') as f:
-            f.write(
-                SPLIT_TEMPLATE.replace('input.las', 
-                                       self.input_file).replace('5000', str(self.capacity)).replace('output_#.las', 
-                                                                                                    self.output_folder + '/output_#.las'))
-        # run the pipeline
-        os.system('pdal pipeline split_pipeline.json')
-        # remove the pipeline file
-        os.remove('split_pipeline.json')
-
-    def transfer_extra_fields(self):
-        # get list of extra fields in the input file using laspy
-        input_file = os.path.join(self.input_file)
-        las = laspy.read(input_file)
-       
-        # get pointcloud and put it into KDTree
-        point_cloud = np.vstack((las.x, las.y, las.z)).transpose()
-
-        tree = KDTree(point_cloud)
-
-        # get the list of output files
-        output_files = [os.path.join(self.output_folder, f) for f in os.listdir(self.output_folder) if f.endswith('.las')]
-        for output_file in output_files:
-            # read the output file
-            target = laspy.read(output_file)
-            target_xyz = target.xyz
-             # find the nearest neighbor for each point in target las file
-            ind = tree.query(target_xyz, k=1, return_distance=False)
-
-            selected_points_x = las.x[ind.ravel()]
-            selected_points_y = las.y[ind.ravel()]
-            selected_points_z = las.z[ind.ravel()]
-            selected_points = np.vstack([selected_points_x, selected_points_y, selected_points_z]).T
-
-
-            new_header = laspy.LasHeader(point_format=las.point_format.id, version=las.header.version)
-
-            # add extra dimensions to new las file
-            for item in LIST_OF_LAS_FIELDS:
-                new_header.add_extra_dim(laspy.ExtraBytesParams(name=item, type=np.int32))
-
-            new_las = laspy.LasData(new_header)
-            
-            # copy x, y, z, gt_label and target_label from target las file to the new las file
-            new_las.x = selected_points_x
-            new_las.y = selected_points_y
-            new_las.z = selected_points_z
-
-            # copy contents of extra dimensions from target las file to the new las file
-            for item in new_header.point_format.dimension_names:
-                if item in LIST_OF_LAS_FIELDS:
-                    new_las[item] = las[item][ind.ravel()]
-
-
-            # write the new las file
-            new_las.write(output_file)
-
-      
-filepath = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/input.las'
-output_folder = '/home/nibio/mutable-outside-world/code/nibio_graph_sem_seg/test_split/output'
-splitter = GeneralSplitter(filepath, output_folder)
-splitter.split()
-splitter.transfer_extra_fields()
-
-
-# if __name__ == '__main__':
-#     parser = argparse.ArgumentParser(description='Split a LAS file into multiple LAS files')
-#     parser.add_argument('--input_file', type=str, help='Input LAS file')
-#     parser.add_argument('--output_folder', type=str, help='Output folder')
-#     parser.add_argument('--capacity', type=int, default=5000, help='Capacity of each output LAS file')
-#     args = parser.parse_args()
-
-#     splitter = GeneralSplitter(args.input_file, args.output_folder, args.capacity)
-#     # splitter.split()
-#     splitter.transfer_extra_fields()
-
-- 
GitLab