Search code examples
python · ram · gdal

Python consuming all my RAM and system hangs


I am trying to extract values from multiple tiff files in a folder using Python. I have used glob.glob to iterate through the files and extract the values for each location based on latitude and longitude values. I have written the code and it runs fine in the beginning, but it eventually keeps consuming my RAM and my system hangs. I have almost 5000 such files and more than 200 locations. The code is given below.

import math,os,sys,glob
import gdal
from openpyxl import load_workbook
f1=('C:\Users\APPLIED\Desktop\SPIE Analysis\Data\Data1.xlsx')
wt = load_workbook(f1, read_only=False)#, keep_vba=True)
sheet1 = wt['Sheet1']
#---------------------------------------------------------------------------
lat = []
file_in = open("C:\Users\APPLIED\Desktop\SPIE Analysis\Data\lat.txt", "r")
for y in file_in.read().split("\n"):
    lat.append(y)
file_in.close
lon = []
file_in = open("C:\Users\APPLIED\Desktop\SPIE Analysis\Data\lon.txt", "r")
for y in file_in.read().split("\n"):
    lon.append(y)
file_in.close
#---------------------------------------------------------------------------
lp=0
y1=2002
print y1
os.chdir("F:\\wget\\Renamed")
for file in glob.glob("*.tif"):
    b=[]
    b=list(file)
    c = file
    a = [b[0],b[1],b[2],b[3]]
    d = [b[4],b[5],b[6]]
    year = int(''.join(a))
    day = int(''.join(d))
    dif=year-2002
    if year!=y1:
        print ""
        print year
        if (year-1)%4==0:
            lp=lp+1
        y1=year
    else:
        sys.stdout.write(">")
    r=day+dif*365+lp+3
    for i in range(0,274):
        u0 = float(os.popen('gdallocationinfo -valonly -wgs84 %s %s %s' % (c, (lon[i]), lat[i])).read())
        if u0<0:
            if u0==-9999:
                sheet1.cell(row=r,column=i+2).value = u0
            else:
                sheet1.cell(row=r,column=i+2).value = 0
        else:
            sheet1.cell(row=r,column=i+2).value = u0/1000
    wt.save(f1)

Solution

  • I wrote this little code snippet; I hope it will work. I recommend using the pandas library instead of openpyxl, as it is much easier to work with. I also changed and optimized the code a little to better conform to present-day standards and Python 3.

    import math,os,sys,glob
    import gdal
    import pandas as pd
    import numpy as np
    #---------------------------------------------------------------------------
    # Station coordinates, one value per line.  Raw strings (r"...") are
    # required here: on Python 3 a plain "\U..." in a string literal is a
    # unicode escape, so these Windows paths would be a SyntaxError.
    with open(r"C:\Users\APPLIED\Desktop\SPIE Analysis\Data\lat.txt", "r") as file_in:
        lat = file_in.read().split("\n")

    with open(r"C:\Users\APPLIED\Desktop\SPIE Analysis\Data\lon.txt", "r") as file_in:
        lon = file_in.read().split("\n")
    #---------------------------------------------------------------------------
    lp = 0      # leap days accumulated so far
    y1 = 2002   # year currently being reported on the console
    print(y1)
    os.chdir("F:\\wget\\Renamed")
    data_dict = {}
    for file in glob.glob("*.tif"):
        year = int(file[:4])  # file names start YYYYDDD: slice year...
        day = int(file[4:7])  # ...and day-of-year straight out of the name
        dif = year - 2002
        if year != y1:
            print("")
            print(year)
            # (year - 1) % 4 == 0 means a leap day fell between y1 and year.
            if not (year - 1) % 4:
                lp = lp + 1
            y1 = year
        else:
            # end="" keeps the ">" progress ticks on one line, matching the
            # original sys.stdout.write behaviour (plain print added newlines).
            print(">", end="")
        r = day + dif * 365 + lp + 3
        # One flat row of 274 station values.  The original allocated
        # np.zeros((1, 274)) and then assigned arr[i], which raises
        # IndexError for every i >= 1 (and overwrote the whole row at i == 0).
        arr = np.zeros(274)
        for i in range(274):
            u0 = float(os.popen('gdallocationinfo -valonly -wgs84 %s %s %s' % (file, (lon[i]), lat[i])).read())
            if u0 == -9999:   # no-data sentinel: store unchanged
                arr[i] = u0
            elif u0 < 0:      # any other negative reading is clamped to zero
                arr[i] = 0
            else:
                arr[i] = u0 / 1000
        data_dict[r] = arr

    # Construct a pandas dataframe out of the dictionary: one row per r.
    df = pd.DataFrame.from_dict(data_dict, orient='index')

    # If some r indices are missing, reindex so the absent days appear as
    # NaN rows.  The keys are already ints, so plain max() suffices
    # (the original max(data_dict, key=int) was redundant).
    df = df.reindex(pd.Index(np.arange(0, max(data_dict) + 1)))

    # Write everything in one shot; the context manager flushes and closes
    # the file (ExcelWriter.save() is deprecated in modern pandas).
    with pd.ExcelWriter(r'C:\Users\APPLIED\Desktop\SPIE Analysis\Data\Data1.xlsx') as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False, header=False, startcol=2)