Search code examples
ruby optimization csv fastercsv

Speed up csv import


I want to import a large amount of CSV data (not directly into ActiveRecord, but after some lookups), and my code is very slow.

# Imports price/quantity updates from the uploaded shate.csv file.
#
# The file is read as ISO-8859-15, transcoded to UTF-8, split on ";" and its
# first row is treated as headers. For every data row:
#
# * column 0 is split on "_" into an article number and a supplier brand;
# * rows whose article number is blank or not longer than 3 characters are
#   skipped;
# * the supplier is the first Supplier whose SUP_BRAND contains the brand;
# * among the articles returned by the full-text lookup, the first one that
#   belongs to that supplier and whose ART_ARTICLE_NR (non-alphanumerics
#   stripped) contains the article number (stripped the same way) receives
#   PRICEM (column 6), QUANTITYM (column 1) and datetime_of_update, and is
#   saved.
def csv_import
  require 'csv'

  non_alnum = /[^0-9A-Za-z]/
  path = "/#{Rails.public_path}/uploads/shate.csv"

  # Block form: the file is opened exactly once and closed when the block
  # exits, even on error. Options are passed without braces so the call works
  # both as a trailing Hash (Ruby 1.9/2.x) and as keywords (Ruby 3+).
  CSV.open(path, "r:ISO-8859-15:UTF-8", :col_sep => ";", :row_sep => :auto, :headers => :first_row) do |csv|
    csv.each do |row|
      ename, esupp = row[0].to_s.split(/_/)
      eprice = row[6]
      eqnt = row[1]

      next unless ename.present? && ename.size > 3

      # All per-row state is kept in locals. With instance variables, an
      # article matched by an earlier row would be updated again by every
      # later row that found no match of its own.
      supplier = nil
      if esupp.present?
        supplier = Supplier.where("SUP_BRAND like ?", "%#{esupp}%").first
        logger.warn("!!! *** supp !!!")
      end

      if supplier
        # Hoisted: these do not depend on the candidate article.
        wanted_nr = ename.gsub(non_alnum, '')
        # NOTE(review): the original built "*NAME*" and then stripped every
        # non-alphanumeric character, which also removed the "*" wildcards.
        # The term sent to the database is unchanged here; confirm whether a
        # wildcard search was actually intended.
        search_term = ename.upcase.gsub(non_alnum, '')

        lookups = ArtLookup.find(:all, :conditions => ['MATCH (ARL_SEARCH_NUMBER) AGAINST(? IN BOOLEAN MODE)', search_term])
        candidates = Article.find(:all, :conditions => { :ART_ID => lookups.map(&:ARL_ART_ID) })

        article = candidates.find do |item|
          item['ART_SUP_ID'] == supplier.SUP_ID &&
            item.ART_ARTICLE_NR.gsub(non_alnum, '').include?(wanted_nr)
        end

        # The matched record is already loaded, so it is updated directly
        # instead of being fetched a second time by id.
        if article
          article.PRICEM = eprice
          article.QUANTITYM = eqnt
          article.datetime_of_update = DateTime.now
          article.save
        end
      end

      logger.warn("------------------------------")
    end
  end
end

Even if I delete everything else and keep only this, it is slow.

# Minimal reproduction: opens the uploaded shate.csv (ISO-8859-15 transcoded
# to UTF-8, ";"-separated, first row as headers) and iterates over every row
# without doing any work.
def csv_import
  require 'csv'

  path = "/#{Rails.public_path}/uploads/shate.csv"

  # The file is opened once, by path, and the block form closes it when done.
  # Options are given without braces so they are accepted both as a trailing
  # Hash (Ruby 1.9/2.x) and as keyword arguments (Ruby 3+).
  CSV.open(path, "r:ISO-8859-15:UTF-8", :col_sep => ";", :row_sep => :auto, :headers => :first_row) do |csv|
    csv.each do |row|
    end
  end
end

Could anybody help me increase the speed using fastercsv?


Solution

  • I don't think it will get much faster.

    That said, some testing shows that a significant part of the time is spent on transcoding (about 15% in my test case). So if you can skip that (e.g. by creating the CSV in UTF-8 in the first place), you should see some improvement.

    Besides, according to ruby-doc.org the "primary" interface for reading CSVs is foreach, so this should be preferred:

    # Streams the uploaded shate.csv row by row with CSV.foreach. The file is
    # read as ISO-8859-15 and transcoded to UTF-8, fields are separated by ";"
    # and the first row is used as headers.
    def csv_import
      # Ruby loads libraries with `require`; there is no `import` method.
      require 'csv'
      # Options are passed without braces so the call is accepted both as a
      # trailing Hash (Ruby 1.9/2.x) and as keyword arguments (Ruby 3+).
      CSV.foreach("/#{Rails.public_path}/uploads/shate.csv", :encoding => 'ISO-8859-15:UTF-8', :col_sep => ';', :row_sep => :auto, :headers => :first_row) do | row |
        # use row here...
      end
    end
    

    Update

    You could also try splitting the parsing across several threads. I saw some performance improvement when experimenting with this code (handling of the header row is left out):

    # Default number of lines handed to each parsing thread.
    N = 10000

    # Parses the uploaded shate.csv in chunks, one thread per chunk.
    #
    # chunk_size - number of lines per chunk (defaults to N).
    #
    # Returns an Array with one Array of fields per line, in file order. The
    # header line is not treated specially and comes back as the first row.
    #
    # Caveats: the input is split on line boundaries before parsing, so a
    # quoted field containing a line break that straddles a chunk boundary is
    # not parsed correctly. NOTE(review): on MRI the global VM lock limits how
    # much CPU-bound parsing can overlap between threads; measure before
    # relying on this for speed.
    def csv_import(chunk_size = N)
      require 'csv'
      # Transcode ISO-8859-15 -> UTF-8 once, while reading. CSV parses a String
      # in that String's own encoding, so the conversion cannot be left to
      # CSV.parse.
      all_lines = File.read("/#{Rails.public_path}/uploads/shate.csv", :encoding => 'ISO-8859-15:UTF-8').lines
      # parts will contain the parsed CSV data of the different chunks/slices
      # threads will contain the threads
      parts, threads = [], []
      # iterate over chunks/slices of chunk_size lines of the CSV file
      all_lines.each_slice(chunk_size) do | plines |
        # add an array object for the current chunk to parts
        parts << result = []
        # create a thread for parsing the current chunk, hand it over the chunk
        # and the current parts sub-array
        threads << Thread.new(plines.join, result) do | tsrc, tresult |
          # parse the chunk; options are passed without braces so they work as
          # a trailing Hash (Ruby 1.9/2.x) and as keywords (Ruby 3+)
          parsed = CSV.parse(tsrc, :col_sep => ";", :row_sep => :auto)
          # add the parsed data to the parts sub-array
          tresult.replace(parsed)
        end
      end
      # wait for all threads to finish
      threads.each(&:join)
      # merge all the parts sub-arrays into one big array and iterate over it
      parts.flatten(1).each do | row |
        # use row (Array)
      end
    end
    

    This splits the input into chunks of 10000 lines and creates a parsing thread for each chunk. Each thread is handed a sub-array in the array parts for storing its result. When all threads have finished (after threads.each(&:join)), the results of all chunks in parts are joined, and that's it.