Search code examples
python, python-3.x, list, data-conversion

How to Convert a Text File into a List in Python3


In Python 3, from an existing .txt file which contains lyrics, subtitles or other text, I want to make a simple flat list (without any nesting) of the words it contains, without spaces or punctuation marks.

Based on other StackExchange questions, I wrote this:

import csv

# Attempt to flatten the lyrics file into a list of single words.
# csv.reader yields one list per line of the file, with the line split at commas.
crimefile = open('she_loves_you.txt', 'r')
reader = csv.reader(crimefile)
allRows = list(reader) # result is a list with nested lists

# Flatten one level: `list += list` extends with the inner list's elements,
# which are still comma-separated chunks of a line, not single words.
ultimate = []
for i in allRows:
    ultimate += i # result is a list with elements longer than one word

# Here each `i` is a str, and `list += str` extends the list by iterating
# over the string, i.e. one character at a time.
ultimate2 = []
for i in ultimate:
    ultimate2 += i # result is a list with elements which are single letters

My desired result would look like this:

['She', 'loves', 'you', 'yeah', 'yeah', 'yeah', 'She', 'loves', 'you', ...]

======================================================================

It would also be interesting to understand why the following code (run as an extension of the code above):

import re
# `ultimate` is a list, but re.findall expects a str (or bytes-like object)
# as its second argument, so this call raises TypeError.
print (re.findall(r"[\w']+", ultimate))

brings the following error:

Traceback (most recent call last):
  File "4.4.4.csv.into.list.py", line 72, in <module>
    print (re.findall(r"[\w']+", ultimate))
  File "/usr/lib/python3.7/re.py", line 223, in findall
    return _compile(pattern, flags).findall(string)
TypeError: expected string or bytes-like object

Solution

  • Below is the full result of the work I did on this question

    import csv
    import re
    import json
    
    #1 def1
    def decomposition(file):
        '''
        Open a text file and return a flat list of the single words it contains.

        file -- path of the text file to read

        Works in 3 steps:
          1. csv.reader turns the file into a list of nested lists
             (one inner list per line, split at commas),
          2. the nested lists are flattened into one list whose elements
             may still be longer than one word,
          3. those elements are joined into one string and cut into words
             (runs of letters, digits, underscores and apostrophes).

        Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
        '''
        # `with` closes the file again; newline='' is what the csv module
        # asks for when it is handed a file object.
        with open(file, 'r', newline='') as crimefile:
            #step1 : list with nested lists
            allRows = list(csv.reader(crimefile))

        #step2 : one list, with elements longer than one word
        ultimate = [chunk for row in allRows for chunk in row]

        #step3 : one list, with elements which are one word long
        # re.findall needs a string: passing the list itself raises TypeError,
        # and str(ultimate) would also feed the quote characters of the list's
        # repr to the pattern, so the chunks are joined with spaces instead.
        return re.findall(r"[\w']+", ' '.join(ultimate))


    # Kept at module level because saving() and the calls at the bottom of the
    # script read this name.
    list_of_words = decomposition('she_loves_you.txt')
    
    
    #2 def2
    def saving(words=None, path='she_loves_you_list.txt'):
        '''
        Write a list of words to a text file as a JSON array.

        words -- the list to save; when omitted, the module-level
                 `list_of_words` is saved
        path  -- file to create (or overwrite); defaults to
                 'she_loves_you_list.txt'

        Returns None.
        '''
        if words is None:
            # Looked up at call time, so the script can build the list first.
            words = list_of_words

        with open(path, 'w') as fp:
            json.dump(words, fp)
    
    
    #3 def3
    def lyric_to_frequencies(lyrics):
        '''
        Count how often each word occurs in a list of words.

        lyrics -- iterable of words (any hashable items)

        Returns a dictionary that maps every distinct word to the number
        of times it appears in `lyrics`.
        '''
        counts = {}
        for token in lyrics:
            # get() supplies 0 the first time a word is seen
            counts[token] = counts.get(token, 0) + 1
        return counts
    
    #4 def4
    def  most_common_words(freqs):
        '''
        Find the word(s) with the highest count.

        freqs -- dictionary mapping words to how often they appear

        Prints and returns a tuple (words, best): `words` is the list of all
        keys whose count equals the highest count, `best` is that count.
        Raises ValueError if `freqs` is empty, because max() has nothing
        to choose from.
        '''
        top_count = max(freqs.values())
        # every key that ties for the highest count is reported
        top_words = [word for word, count in freqs.items() if count == top_count]
        print(top_words, top_count)
        return top_words, top_count
    
    #5 def5
    def words_often(freqs, minTimes):
        '''
        Collect the words that appear at least `minTimes` times.

        freqs    -- dictionary mapping words to how often they appear
        minTimes -- smallest count a word must have to be included

        Returns a list of (words, count) tuples, highest count first; each
        tuple holds all words that share that count. `freqs` itself is left
        unchanged.
        '''
        # Work on a copy so the caller's dictionary is not emptied.
        remaining = dict(freqs)
        result = []
        # Stop when nothing is left: most_common_words() calls max(), which
        # raises ValueError on an empty dictionary.
        while remaining:
            words, count = most_common_words(remaining)
            if count < minTimes:
                break
            result.append((words, count))
            # Remove this group so the next pass finds the next-highest count.
            for w in words:
                del remaining[w]
        return result
    
    
    
    # Run the five steps in order.
    # Requires decomposition() to be defined earlier in the file.

    #1 : read the lyrics file
    decomposition('she_loves_you.txt')

    #2 : store the word list as JSON
    saving()

    #3 : count every word
    frequencies = lyric_to_frequencies(list_of_words)

    #4 : print the most frequent word(s)
    most_common_words(frequencies)

    #5 : every group of words appearing at least 5 times
    # (handed a fresh table of its own, as words_often may delete entries)
    words_often(lyric_to_frequencies(list_of_words), 5)