Search code examples
c#choetl

Replace or dont append if record/row already exists in csv


I have the following code that loops through json files in a directory and creates a csv file with following records:

results.csv

File Name    Page  Practice Name
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Associates & Co

However, if i stop execution and rerun the program again, what happens is that the same records get inserted in the csv file again, resulting in:

File Name    Page  Practice Name
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Associates & Co
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Associates & Co 

How do i check if the record already exists (i.e. each field is the same of the fields being inserted) and replace it (or basically dont append it again?) for example, if i were to run the program again because there was a change in file1.json and also because there was a new file added to the directory, the new csv should look like this:

results.csv:

File Name    Page  Practice Name
fileXYZ.json 1     XYZ & Co
fileAB2.json 1     ABC & Co
file1.json   1     Corpum & Co
file32.json  1     FirmA

code:

using ChoETL;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

static void Main(string[] args)
{
    //Output to CSV
    foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
    {
        JsonToCsv(jsonFile, csvFilePath);
    }
}
public static string fieldValue(IEnumerable<dynamic> lines, string nameOfField, bool throwException = false)
{
    var skipped = lines.SkipWhile(l => l.text != nameOfField);

    switch (throwException)
    {
        case true:
            var enumerator = lines.GetEnumerator();
            
            while (enumerator.MoveNext())
            {
                if (skipped.Count() == 0)
                    return skipped.Skip(1).First().text;
                else
                    throw new InvalidDataException("Odd number of items found in IEnumerable<>");
            }

            break;
        case false:
            // Skip(#) to skip over the unnecessary Lines, 
            // such as "Account Information", preceding "Practice Name".
            return skipped.Skip(1).First().text;

            break;
        default:
            Console.WriteLine("Default case");
            break;
    }
    // Returning null isn't recommended, but it does fix the error "not all code paths return a value"
    return null;
}
public static void JsonToCsv(string jsonInputFile, string csvFile)
{
    using (var p = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
    {
        using (var fs = new FileStream(csvFile, FileMode.Append, FileAccess.Write))
        {
             using (var writer = new ChoCSVWriter(fs))
             {
                writer.WithField("FileName", fieldName: "File Name")
                    .WithField("Page")
                    .WithField("PracticeName", fieldName: "Practice Name");

                if (fs.Position == 0) // we don't need header if file already existed before
                {
                    writer.WithFirstLineHeader();
                }
                
                // Limit the result to page 1 since the fields below only exist on the 1st page
                writer.Write(p
                    .Where(r1 => r1.page == 1)
                    .Select(r1 =>
                    {
                        var lines = (dynamic[])r1.lines;
                        return new
                        {
                            FileName = jsonInputFile,
                            Page = r1.page,
                            PracticeName = //lines[6].text,
                                fieldValue(lines, "Practice Name"),
                        };
                    }
                ));

                }

                fs.Write(Environment.NewLine); // append new line carrier so we don't write to the same line when file reopened for writing
        }
    }
}

Sample JSON file

{
  "status": "succeeded",
  "createdDateTime": "2020-10-30T15:56:11Z",
  "lastUpdatedDateTime": "2020-10-30T15:56:12Z",
  "analyzeResult": {
    "version": "3.0.0",
    "readResults": [
      {
        "page": 1,
        "angle": 0.086,
        "width": 684,
        "height": 272,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              7,
              6,
              196,
              5,
              196,
              24,
              7,
              25
            ],
            "text": "Account Information",
            "words": [
              {
                "boundingBox": [
                  10,
                  7,
                  83,
                  7,
                  81,
                  24,
                  7,
                  26
                ],
                "text": "Account",
                "confidence": 0.981
              },
              {
                "boundingBox": [
                  87,
                  7,
                  196,
                  6,
                  196,
                  24,
                  85,
                  24
                ],
                "text": "Information",
                "confidence": 0.939
              }
            ]
          },
          {
            "boundingBox": [
              120,
              56,
              223,
              57,
              223,
              70,
              120,
              70
            ],
            "text": "Practice Name",
            "words": [
              {
                "boundingBox": [
                  120,
                  57,
                  176,
                  57,
                  176,
                  70,
                  120,
                  71
                ],
                "text": "Practice",
                "confidence": 0.982
              },
              {
                "boundingBox": [
                  179,
                  57,
                  222,
                  57,
                  222,
                  71,
                  179,
                  70
                ],
                "text": "Name",
                "confidence": 0.985
              }
            ]
          },
          {
            "boundingBox": [
              236,
              62,
              390,
              62,
              390,
              77,
              236,
              77
            ],
            "text": "Some Practice Name",
            "words": [
              {
                "boundingBox": [
                  236,
                  62,
                  277,
                  62,
                  277,
                  78,
                  236,
                  78
                ],
                "text": "Some",
                "confidence": 0.987
              },
              {
                "boundingBox": [
                  280,
                  62,
                  340,
                  62,
                  341,
                  78,
                  280,
                  77
                ],
                "text": "Practice",
                "confidence": 0.984
              },
              {
                "boundingBox": [
                  343,
                  62,
                  390,
                  62,
                  390,
                  78,
                  344,
                  78
                ],
                "text": "Name",
                "confidence": 0.987
              }
            ]
          },
          {
            "boundingBox": [
              107,
              102,
              223,
              102,
              223,
              115,
              107,
              115
            ],
            "text": "Owner Full Name",
            "words": [
              {
                "boundingBox": [
                  108,
                  103,
                  151,
                  102,
                  151,
                  116,
                  107,
                  116
                ],
                "text": "Owner",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  154,
                  102,
                  177,
                  102,
                  176,
                  116,
                  153,
                  116
                ],
                "text": "Full",
                "confidence": 0.954
              },
              {
                "boundingBox": [
                  180,
                  102,
                  224,
                  103,
                  223,
                  116,
                  179,
                  116
                ],
                "text": "Name",
                "confidence": 0.987
              }
            ]
          },
          {
            "boundingBox": [
              237,
              104,
              298,
              104,
              298,
              119,
              237,
              119
            ],
            "text": "Bob Lee",
            "words": [
              {
                "boundingBox": [
                  238,
                  104,
                  266,
                  104,
                  266,
                  119,
                  238,
                  120
                ],
                "text": "Bob",
                "confidence": 0.987
              },
              {
                "boundingBox": [
                  269,
                  104,
                  298,
                  105,
                  298,
                  120,
                  269,
                  119
                ],
                "text": "Lee",
                "confidence": 0.987
              }
            ]
          },
          {
            "boundingBox": [
              136,
              147,
              223,
              147,
              223,
              160,
              137,
              161
            ],
            "text": "Owner Email",
            "words": [
              {
                "boundingBox": [
                  137,
                  148,
                  181,
                  147,
                  181,
                  161,
                  137,
                  162
                ],
                "text": "Owner",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  184,
                  147,
                  224,
                  147,
                  224,
                  161,
                  184,
                  161
                ],
                "text": "Email",
                "confidence": 0.985
              }
            ]
          },
          {
            "boundingBox": [
              239,
              144,
              361,
              144,
              361,
              162,
              239,
              162
            ],
            "text": "bob@gmail.com",
            "words": [
              {
                "boundingBox": [
                  240,
                  145,
                  362,
                  146,
                  361,
                  163,
                  240,
                  163
                ],
                "text": "bob@gmail.com",
                "confidence": 0.974
              }
            ]
          },
          {
            "boundingBox": [
              137,
              193,
              224,
              193,
              224,
              208,
              137,
              208
            ],
            "text": "Server Setup",
            "words": [
              {
                "boundingBox": [
                  137,
                  194,
                  179,
                  194,
                  179,
                  208,
                  137,
                  208
                ],
                "text": "Server",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  182,
                  194,
                  224,
                  194,
                  224,
                  209,
                  182,
                  208
                ],
                "text": "Setup",
                "confidence": 0.985
              }
            ]
          },
          {
            "boundingBox": [
              276,
              188,
              340,
              192,
              339,
              211,
              275,
              209
            ],
            "text": "cloud",
            "words": [
              {
                "boundingBox": [
                  297,
                  192,
                  339,
                  194,
                  339,
                  211,
                  297,
                  211
                ],
                "text": "cloud",
                "confidence": 0.933
              }
            ]
          },
          {
            "boundingBox": [
              376,
              187,
              461,
              191,
              460,
              212,
              376,
              211
            ],
            "text": "Location",
            "words": [
              {
                "boundingBox": [
                  394,
                  191,
                  460,
                  196,
                  459,
                  211,
                  394,
                  211
                ],
                "text": "Location",
                "confidence": 0.844
              }
            ]
          },
          {
            "boundingBox": [
              500,
              189,
              666,
              192,
              665,
              212,
              499,
              211
            ],
            "text": "LIcentral (multi-location)",
            "words": [
              {
                "boundingBox": [
                  501,
                  190,
                  567,
                  195,
                  567,
                  212,
                  500,
                  212
                ],
                "text": "LIcentral",
                "confidence": 0.665
              },
              {
                "boundingBox": [
                  572,
                  195,
                  665,
                  195,
                  665,
                  212,
                  571,
                  212
                ],
                "text": "(multi-location)",
                "confidence": 0.899
              }
            ]
          },
          {
            "boundingBox": [
              21,
              238,
              224,
              238,
              223,
              255,
              21,
              253
            ],
            "text": "Number of Locations Enrolling",
            "words": [
              {
                "boundingBox": [
                  21,
                  239,
                  76,
                  239,
                  76,
                  253,
                  21,
                  253
                ],
                "text": "Number",
                "confidence": 0.985
              },
              {
                "boundingBox": [
                  79,
                  239,
                  92,
                  239,
                  92,
                  253,
                  79,
                  253
                ],
                "text": "of",
                "confidence": 0.983
              },
              {
                "boundingBox": [
                  95,
                  239,
                  161,
                  239,
                  161,
                  254,
                  95,
                  253
                ],
                "text": "Locations",
                "confidence": 0.981
              },
              {
                "boundingBox": [
                  164,
                  239,
                  224,
                  239,
                  223,
                  256,
                  163,
                  254
                ],
                "text": "Enrolling",
                "confidence": 0.983
              }
            ]
          },
          {
            "boundingBox": [
              273,
              237,
              289,
              239,
              288,
              257,
              272,
              255
            ],
            "text": "1",
            "words": [
              {
                "boundingBox": [
                  278,
                  237,
                  290,
                  239,
                  287,
                  257,
                  276,
                  255
                ],
                "text": "1",
                "confidence": 0.981
              }
            ]
          },
          {
            "boundingBox": [
              337,
              239,
              670,
              239,
              670,
              253,
              337,
              252
            ],
            "text": "*If more than 1 location, add info on the locations form",
            "words": [
              {
                "boundingBox": [
                  338,
                  239,
                  347,
                  239,
                  347,
                  252,
                  338,
                  252
                ],
                "text": "*If",
                "confidence": 0.874
              },
              {
                "boundingBox": [
                  350,
                  239,
                  384,
                  239,
                  384,
                  253,
                  350,
                  252
                ],
                "text": "more",
                "confidence": 0.983
              },
              {
                "boundingBox": [
                  386,
                  239,
                  416,
                  239,
                  416,
                  253,
                  386,
                  253
                ],
                "text": "than",
                "confidence": 0.986
              },
              {
                "boundingBox": [
                  419,
                  239,
                  422,
                  239,
                  422,
                  253,
                  419,
                  253
                ],
                "text": "1",
                "confidence": 0.635
              },
              {
                "boundingBox": [
                  425,
                  239,
                  478,
                  239,
                  478,
                  253,
                  425,
                  253
                ],
                "text": "location,",
                "confidence": 0.955
              },
              {
                "boundingBox": [
                  481,
                  239,
                  506,
                  239,
                  506,
                  253,
                  481,
                  253
                ],
                "text": "add",
                "confidence": 0.986
              },
              {
                "boundingBox": [
                  509,
                  239,
                  533,
                  239,
                  533,
                  253,
                  509,
                  253
                ],
                "text": "info",
                "confidence": 0.981
              },
              {
                "boundingBox": [
                  535,
                  239,
                  551,
                  239,
                  552,
                  253,
                  535,
                  253
                ],
                "text": "on",
                "confidence": 0.988
              },
              {
                "boundingBox": [
                  554,
                  239,
                  574,
                  239,
                  575,
                  253,
                  554,
                  253
                ],
                "text": "the",
                "confidence": 0.987
              },
              {
                "boundingBox": [
                  577,
                  239,
                  634,
                  239,
                  634,
                  253,
                  577,
                  253
                ],
                "text": "locations",
                "confidence": 0.973
              },
              {
                "boundingBox": [
                  636,
                  239,
                  666,
                  240,
                  666,
                  253,
                  637,
                  253
                ],
                "text": "form",
                "confidence": 0.986
              }
            ]
          }
        ]
      }
    ]
  }
}

screenshot of csv after adding spproach 2 of Supun De Silva answer: enter image description here


Solution

  • FYI. Sample file you provided does not work as it fails at var lines = (dynamic[])r1.lines;

    Approach 1 - Rename Old File and Create a new one for data appending

    1. Introduce new Function

     private static void RenameIfExist(string csvFilePath)
     {
        if (File.Exists(csvFilePath))
        {
            System.IO.File.Move(csvFilePath, $"{csvFilePath}_{DateTime.Now.ToString("backup_yyyyMMdd_HHmmss")}");
        }
    }
    

    2. Call the Mover Function and use Create mode in new file

    public static void JsonToCsv(string jsonInputFile, string csvFile)
    {
        using (var p = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
        {
            Program.RenameIfExist(csvFile);
    
            using (var fs = new FileStream(csvFile, FileMode.Create, FileAccess.Write))
            {
                try
                {
                    using (ChoCSVWriter<dynamic>  writer = new ChoCSVWriter(fs)
                        .WithField("FileName", fieldName: "File Name")
                        .WithField("Page")
                        .WithField("PracticeName", fieldName: "Practice Name")
                        .WithFirstLineHeader())
                    {
                        // Limit the result to page 1 since the fields below only exist on the 1st page
                        writer.Write(p
                            .Where(r1 => r1.page == 1)
                            .Select(r1 =>
                            {
                                var lines = (dynamic[])r1.lines;
                                return new
                                {
                                    FileName = jsonInputFile,
                                    Page = r1.page,
                                    PracticeName = fieldValue(lines, "Practice Name"),
                                };
                            }
                        ));
                    }
    
                }
                catch(Exception e)
                {
                    throw e;
                }
            }
        }
    }
    

    Approach 2 - Open Existing File and create a lookup structure with the data

    • You may need to tweak this a bit

    1. Declare new struct to store a key

    private static Dictionary<string, bool> processedfileStates = new Dictionary<string, bool>();
    

    2. Pre-Loader Function

    private static void LoadOldStatsIfExist(string csvFilePath)
    {
        if (File.Exists(csvFilePath))
        {
            using (var fs = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read))
            {
                using (ChoCSVReader<dynamic> reader = new ChoCSVReader(fs).WithFirstLineHeader())
                {
                    using (var dataReader = reader.AsDataReader())
                    {
                        while (dataReader.Read())
                        {
                            Program.processedfileStates.Add($"{dataReader[0].ToString()}_{dataReader[1].ToString()}_{dataReader[2].ToString()}", true);
                        }
                    }
                }
            }
                
        }
    }
    

    3. Json to CSV fcn

    public static void JsonToCsv(string jsonInputFile, string csvFile)
        {
            using (var p = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
            {
                Program.LoadOldStatsIfExist(csvFile);
    
                using (var fs = new FileStream(csvFile, Program.processedfileStates.Count == 0 ? FileMode.Create : FileMode.Append, FileAccess.Write))
                {
                    if (Program.processedfileStates.Count != 0)
                    {
                        fs.Write(Environment.NewLine);
                    }
                    
                    try
                    {
                        ChoCSVWriter<dynamic> writer = new ChoCSVWriter(fs);
                        if (Program.processedfileStates.Count == 0)
                        {
                            writer.WithFirstLineHeader();
                        }
    
                        using (writer
                            .WithField("FileName", fieldName: "File Name")
                            .WithField("Page")
                            .WithField("PracticeName", fieldName: "Practice Name")
                            )
                        {
    
                            if (Program.processedfileStates.Count == 0)
                            {
                                writer = writer.WithFirstLineHeader();
                            }
    
                            // Limit the result to page 1 since the fields below only exist on the 1st page
                            var data = p
                                .Where(r1 => r1.page == 1)
    
                                .Select(r1 =>
                                {
                                    var lines = (dynamic[])r1.lines;
                                    return new
                                    {
                                        FileName = jsonInputFile,
                                        Page = r1.page,
                                        PracticeName = fieldValue(lines, "Practice Name"),
                                    };
                                }
                            ).Where(de => !processedfileStates.ContainsKey($"{de.FileName.ToString()}_{de.Page.ToString()}_{de.PracticeName.ToString()}"));
    
                            writer.Write(data);
                        }
    
                    }
                    catch (Exception e)
                    {
                        throw e;
                    }
                }
            }
        }
    

    Approach 2 - Refactored

     public class OPModel
    {
        public string FileName { get; set; }
        public long Page { get; set; }
        public string PracticeName { get; set; }
    }
    
    public class Program
    {
        const string jsonFilesPath = "D:\\DevWork\\C#\\TempProject1\\ConsoleApp1\\data";
        const string csvFilePath = "D:\\DevWork\\C#\\TempProject1\\ConsoleApp1\\output\\op.csv";
    
        private static Dictionary<string, bool> processedfileStates = new Dictionary<string, bool>();
        private static bool fileExisted = false;
        private static void RenameIfExist(string csvFilePath)
        {
            if (File.Exists(csvFilePath))
            {
                System.IO.File.Move(csvFilePath, $"{csvFilePath}_{DateTime.Now.ToString("backup_yyyyMMdd_HHmmss")}");
            }
        }
        private static void LoadOldStatsIfExist(string csvFilePath)
        {
            if (File.Exists(csvFilePath))
            {
                using (var fs = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read))
                {
                    using (ChoCSVReader<dynamic> reader = new ChoCSVReader(fs).WithFirstLineHeader())
                    {
                        using (var dTable = reader.AsDataTable())
                        {
                            foreach (DataRow row in dTable.Rows)
                            {
                                Program.processedfileStates.Add($"{row["File Name"].ToString()}_{row["Page"].ToString()}_{row["Practice Name"].ToString()}", true);
                            }
                        }
                    }
                }
            }
        }
        public static void Main(string[] args)
        {
            try
            {
                Program.fileExisted = File.Exists(csvFilePath);
                Program.LoadOldStatsIfExist(csvFilePath);
    
                List<OPModel> dataToWrite = new List<OPModel>();
                // Persist each file to
                foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
                {
                    dataToWrite.AddRange(JsonToCsv(jsonFile));
                }
    
                if (dataToWrite.Count != 0)
                {
                    using (var fs = new FileStream(csvFilePath, !Program.fileExisted ? FileMode.Create : FileMode.Append, FileAccess.Write))
                    {
                        try
                        {
                            ChoCSVWriter<OPModel> writer = new ChoCSVWriter<OPModel>(fs);
                            using (writer.WithField("FileName", fieldName: "File Name").WithField("Page").WithField("PracticeName", fieldName: "Practice Name"))
                            {
                                if (!Program.fileExisted)
                                {
                                    writer = writer.WithFirstLineHeader();
                                }
    
                                writer.Write(dataToWrite);
                            }
    
                            fs.Write(Environment.NewLine);
    
                        }
                        catch (Exception e)
                        {
                            throw e;
                        }
                    }
                }
                
    
                //Output to CSV
                
                Console.ReadKey();
            }
            catch (Exception ex)
            {
    
            }
        }
        public static string fieldValue(IEnumerable<dynamic> lines, string nameOfField, bool throwException = false)
        {
            var skipped = lines.SkipWhile(l => l.text != nameOfField);
    
            switch (throwException)
            {
                case true:
                    var enumerator = lines.GetEnumerator();
    
                    while (enumerator.MoveNext())
                    {
                        if (enumerator.MoveNext())
                            return skipped.Skip(1).First().text;
                        else
                            throw new InvalidDataException("Odd number of items found in IEnumerable<>");
                    }
    
                    break;
                case false:
                    // Skip(#) to skip over the unnecessary Lines, 
                    // such as "Account Information", preceding "Practice Name".
                    return skipped.Skip(1).First().text;
                default:
                    Console.WriteLine("Default case");
                    break;
            }
            // Returning null isn't recommended, but it does fix the error "not all code paths return a value"
            return null;
        }
    
    
        public static List<OPModel> JsonToCsv(string jsonInputFile)
        {
            using (var reader = new ChoJSONReader(jsonInputFile).WithJSONPath("$..readResults"))
            {
                var data = reader.Where(r1 => r1.page == 1)
                                .Select(r1 =>
                                {
                                    var lines = (dynamic[])r1.lines;
                                    return new OPModel
                                    {
                                        FileName = jsonInputFile,
                                        Page = r1.page,
                                        PracticeName = Program.fieldValue(lines, "Practice Name")
                                    };
                                }
                            ).Where(de => !processedfileStates.ContainsKey($"{de.FileName.ToString()}_{de.Page.ToString()}_{de.PracticeName.ToString()}")).ToList();
    
                return data;
            }
        }
    }