Search code examples
python file split httplib2

Reading a file from the internet and splitting it into 2


I am brand new to Python and am trying the following: I am reading a file from the internet and want to split it after a certain number of lines. 1. File = line 1 to x 2. File = line x+1 to EOF

I use httplib2 to read the file from the internet and then want to split this file into 2. I tried it with a "with" statement, but it seems that I cannot use f.readline() etc. when I read a file from the internet and use it with "with". If I open a local file, it works fine.

Am I missing something here?

Thank you very much for your help in advance.

with data_file as f: #data_file is the file read from the internet

Here is my function:

 def create_data_files(data_file):
    """Download DATA_URL through httplib2 and compute the training/test split point.

    The ``data_file`` argument is not read: it is overwritten with the decoded
    body of the HTTP response. The response is cached in the ``.cache``
    directory, and the function prints whether the body came from that cache.

    NOTE: the split itself is not implemented yet. The function only computes
    and prints the number of lines that would go into the training set and
    returns placeholder values.

    Args:
        data_file: Ignored; kept so existing callers keep working.

    Returns:
        A ``(training_data_file, test_data_file)`` tuple. Both entries are
        currently the placeholder ``0``.

    Raises:
        httplib2.HttpLib2Error: If the request fails. The error is printed and
            re-raised, because nothing below can run without a response.
    """
    print("Reading file from the Internet or Cache")
    try:
        h = httplib2.Http(".cache")
        # Pass headers={'cache-control': 'no-cache'} to force a download
        # from the internet instead of using the cache.
        data_header, data_file = h.request(DATA_URL)
    except httplib2.HttpLib2Error as e:
        print(e)
        # Swallowing the error here would only defer the failure to an
        # unrelated UnboundLocalError on data_header below.
        raise

    # The response body is not a file-like object; decode it to text once.
    data_file = data_file.decode()

    # Give the info if the file was read from the internet or from the cache
    print("DataHeader", data_header.fromcache)

    if data_header.fromcache:
        print("File was read from cache")
    else:
        print("File was read from the internet")

    # Counting the lines in the file
    print("Counting lines in the file")
    single_line = data_file.split("\n")
    print("Total amount of lines in the original file", len(single_line))

    # Percentage of the total lines that should become training data.
    # The loop is the scaffold for asking the user; the value is fixed for now.
    while True:
        # split_factor = int(input("What percentage should be use as training data? Enter a number between 0 and 100: "))
        split_factor = 70
        print("Split Factor set to 70% for test purposes")
        if 0 <= split_factor <= 100:
            break
        print('try again')

    # Truncate so the training set never exceeds the requested percentage.
    split_number = int(len(single_line) * split_factor / 100)
    print("Number of Training set data", split_number)  # just for testing

    # Splitting the file into 2 -- not implemented yet, placeholders only.
    training_data_file = 0
    test_data_file = 0

    return training_data_file, test_data_file

Solution

  • from collections import deque
    import httplib2
    
    
    def create_data_files(data_url, split_factor=0.7):
        """Download ``data_url`` and split its text into a training and a test part.

        The response body is decoded, split on newlines, and cut at index
        ``int(len(lines) * split_factor)``: lines before the index form the
        training part, the rest form the test part.

        Args:
            data_url: URL requested with ``httplib2.Http().request(..., "GET")``.
            split_factor: Fraction of the lines that go into the training part.
                Values below 0 put every line in the test part, values above 1
                put every line in the training part.

        Returns:
            A ``(training_str, test_str)`` tuple; each part is its lines
            re-joined with newlines.
        """
        h = httplib2.Http()
        _resp_headers, content = h.request(data_url, "GET")
        # for python3
        content = content.decode()

        lines = content.split('\n')

        # Truncate and clamp: exactly `split_index` lines go to training
        # (the previous `i <= stop` test put one line too many there), and an
        # out-of-range factor can never produce a negative slice index.
        split_index = max(0, min(len(lines), int(len(lines) * split_factor)))
        training, test = lines[:split_index], lines[split_index:]

        return '\n'.join(training), '\n'.join(test)
    

    This should do the trick (not tested and simplified).

    data_header, data_file = h.request(DATA_URL)

data_file is not a file-like object but a string, so it cannot be used with "with" or f.readline().