Search code examples
c#.netmultithreadingzipgtfs

Reading multiple files from zip archive with CsvHelper simultaneously


I'm working on something that will download a zip archive, then read each of the files in said archive into Lists. When I program it synchronously it succeeds, but takes forever.

I decided to try to use Tasks to read each of the files on different threads. When I do this, I get the following:

End of Central Directory record could not be found

Here's a class I wrote to handle the download and extraction:

public class GtfsFileDownloader
{
    public string FileLocation { get; set; }
    public string FileName { get; set; }
    public MemoryStream ZipStream { get; set; }

    public GtfsFileDownloader(string loc, string nm)
    {
        FileLocation = loc;
        FileName = nm;
    }

    public void DownloadZip()
    {
        ZipStream = new MemoryStream(new WebClient().DownloadData(FileLocation + FileName));
    }

    public List<T> GetFileContents<T, Q>(string fileName) where Q: ClassMap
    {
        var retList = new List<T>();
        var entry = new ZipArchive(ZipStream).Entries.SingleOrDefault(x => x.FullName == fileName);
        if(entry != null)
        {
            using (var reader = new StreamReader(entry.Open()))
            {
                using (var csv = new CsvReader(reader))
                {
                    csv.Configuration.HeaderValidated = null;
                    csv.Configuration.MissingFieldFound = null;
                    csv.Configuration.RegisterClassMap<Q>();
                    try
                    {
                        retList = csv.GetRecords<T>().ToList();
                    }
                    catch(CsvHelperException ex)
                    {
                        throw new System.Exception(ex.Message);
                    }
                }
            }
        }
        return retList;
    }
}

And here's the main code:

var downloader = new GtfsFileDownloader(agency.GtfsZipUrlDirectory, agency.GtfsZipUrlFileName);
                            downloader.DownloadZip();

                            var agencyInfo = new List<DbAgency>();
                            var stopInfo = new List<DbStop>();
                            var routeInfo = new List<DbRoute>();
                            var tripInfo = new List<DbTrip>();
                            var stopTimeInfo = new List<DbStopTime>();
                            var calendarInfo = new List<DbCalendar>();
                            var fareAttributeInfo = new List<DbFareAttribute>();
                            var shapeInfo = new List<DbShape>();
                            var frequencyInfo = new List<DbFrequency>();
                            var transferInfo = new List<DbTransfer>();
                            var pathwayInfo = new List<DbPathway>();
                            var levelInfo = new List<DbLevel>();
                            var feedInfoInfo = new List<DbFeedInfo>();
                            var tasks = new List<Task>();

                            tasks.Add(new Task(() => { agencyInfo = downloader.GetFileContents<DbAgency, AgencyMap>("agencies.txt");                              }));
                            tasks.Add(new Task(() => { stopInfo = downloader.GetFileContents<DbStop, StopMap>("stops.txt");                                       }));
                            tasks.Add(new Task(() => { routeInfo = downloader.GetFileContents<DbRoute, RouteMap>("routes.txt");                                   }));
                            tasks.Add(new Task(() => { tripInfo = downloader.GetFileContents<DbTrip, TripMap>("trips.txt");                                       }));
                            tasks.Add(new Task(() => { stopTimeInfo = downloader.GetFileContents<DbStopTime, StopTimeMap>("stop_times.txt");                      }));
                            tasks.Add(new Task(() => { calendarInfo = downloader.GetFileContents<DbCalendar, CalendarMap>("calendar.txt");                        }));
                            tasks.Add(new Task(() => { fareAttributeInfo = downloader.GetFileContents<DbFareAttribute, FareAttributeMap>("fare_attributes.txt");  }));
                            tasks.Add(new Task(() => { shapeInfo = downloader.GetFileContents<DbShape, ShapeMap>("shapes.txt");                                   }));
                            tasks.Add(new Task(() => { frequencyInfo = downloader.GetFileContents<DbFrequency, FrequencyMap>("frequencies.txt");                  }));
                            tasks.Add(new Task(() => { transferInfo = downloader.GetFileContents<DbTransfer, TransferMap>("transfers.txt");                       }));
                            tasks.Add(new Task(() => { pathwayInfo = downloader.GetFileContents<DbPathway, PathwayMap>("pathways.txt");                           }));
                            tasks.Add(new Task(() => { levelInfo = downloader.GetFileContents<DbLevel, LevelMap>("levels.txt");                                   }));
                            tasks.Add(new Task(() => { feedInfoInfo = downloader.GetFileContents<DbFeedInfo, FeedInfoMap>("feed_info.txt");                       }));
                            foreach(Task t in tasks)
                            {
                                t.Start();
                            }

                            Task.WaitAll(tasks.ToArray());

I'm assuming I'm doing something incorrectly with multithreading (I'm not too experienced in multithreading). Like I mentioned, if I take out the Task stuff and run it single-threaded, it does not throw the error above.


Solution

  • Try making MemoryStream per Task:

    public class GtfsFileDownloader
    {
        public string FileLocation { get; set; }
        public string FileName { get; set; }
        public byte[] ZipBytes { get; set; }
    
        public GtfsFileDownloader(string loc, string nm)
        {
            FileLocation = loc;
            FileName = nm;
        }
    
        public void DownloadZip()
        {
            ZipBytes = new WebClient().DownloadData(FileLocation + FileName);
        }
    
        public List<T> GetFileContents<T, Q>(string fileName) where Q: ClassMap
        {
            var retList = new List<T>();
            using (var ZipStream = new MemoryStream(ZipBytes)) {
                var entry = new ZipArchive(ZipStream).Entries.SingleOrDefault(x => x.FullName == fileName);
                if(entry != null)
                {
                    using (var reader = new StreamReader(entry.Open()))
                    {
                        using (var csv = new CsvReader(reader))
                        {
                            csv.Configuration.HeaderValidated = null;
                            csv.Configuration.MissingFieldFound = null;
                            csv.Configuration.RegisterClassMap<Q>();
                            try
                            {
                                retList = csv.GetRecords<T>().ToList();
                            }
                            catch(CsvHelperException ex)
                            {
                                throw new System.Exception(ex.Message);
                            }
                        }
                    }
                }
            }
            return retList;
        }
    }