Search code examples
python python-re

How to split text into paragraphs by specific start and end markers using Python re (or something else)?


How can I split text into paragraphs that begin and end with specific markers, as shown below, using Python's re module or some other approach?

Original_texts_with_paragraphs= """
0 zzzzz zzzz zzzzzzzz zzz GG G GGG
1 zzz AA zzz AAA zzzz zzz zzz zzzz zzzz zzzz zzz
2 zzzzz AA zzz zzz AAA zzzz zzzzz zzzzz zzzzz zzzz
3 zzz zzzz zzz zzz zzzz zzzz zzz zzzz EE zzz EEE
4 zzz zzzz zzz zzz zzz zzzz zzz zzzz zzz zzzz zzz zzz
5 zz zzz zzz zzz zzz zzz zz zzz zzzzzz zzzz zzzz zzz
6 BBBB BB zzzz BBB zz B zzzzzz zzzz zzzz zzzz zzzz zz
7 zzz zzz zzzz zzzz zzz zzzzz zzzz zzzzz zzzzz FF FFF zzz
8 CCCC zzz CCC zzzz zz zzzz zzzz zzz zzz GG G GGG
9 zzz AA zz AAA zzzz zzz zzz zzz FF zz FFF
10 AA zzzz AAA zzzz zzzz zzzz zzzzz zzzzz
11 zz CCCC zzz CCC zzzz zzzz zzzz zzzz zzzz zz
12 zzzz zzz zzzzz zzzzz zzzzz zzzzz zzzz zzzz EE zz EEE
13 zz zzz zzzz zzz zzz zzz 
14 zzz AA AAA zzz zzz zzzz zzzz GG zz GGG
15 BBBB BB BBB zz B zzzz zz GG zz G GGG
16 zzz  zzzz
17 zzzz zzz zzz  zzz
18 zz AA zzz AAA zzzz zzzz zzzz zzz
19 zzzz zzz zzz zzz zzzz zzzz zzzz GG G GGG
20 zzzz zzzzz zzzzzzzz zzz
21 zzzzz zzzz zzzzzzzz zzz GG G GGG
22 zzzzzz zzz zz zzzz
23 zzzzz AA zzz zzz AAA zzzz zzzzz zzzzz zzzzz zzzz
24 zzz zzzz zzz zzz zzzz zzzz zzz zzzz EE zzz EEE
"""

Starts:

Starts_with = """
AA AAA
BBBB BB BBB B
CCCC CCC
"""

Ends:

Ends_with = """
EE EEE
FF FFF
GG G GGG
"""

Desired output:

desired_output = """
1 zzz AA zzz AAA zzzz zzz zzz zzzz zzzz zzzz zzz
2 zzzzz AA zzz zzz AAA zzzz zzzzz zzzzz zzzzz zzzz
3 zzz zzzz zzz zzz zzzz zzzz zzz zzzz EE zzz EEE


6 BBBB BB zzzz BBB zz B zzzzzz zzzz zzzz zzzz zzzz zz
7 zzz zzz zzzz zzzz zzz zzzzz zzzz zzzzz zzzzz FF FFF zzz


8 CCCC zzz CCC zzzz zz zzzz zzzz zzz zzz GG G GGG


9 zzz AA zz AAA zzzz zzz zzz zzz FF zz FFF


10 AA zzzz AAA zzzz zzzz zzzz zzzzz zzzzz
11 zz CCCC zzz CCC zzzz zzzz zzzz zzzz zzzz zz
12 zzzz zzz zzzzz zzzzz zzzzz zzzzz zzzz zzzz EE zz EEE


14 zzz AA AAA zzz zzz zzzz zzzz GG zz GGG


15 BBBB BB BBB zz B zzzz zz GG zz G GGG


18 zz AA zzz AAA zzzz zzzz zzzz zzz
19 zzzz zzz zzz zzz zzzz zzzz zzzz GG G GGG


23 zzzzz AA zzz zzz AAA zzzz zzzzz zzzzz zzzzz zzzz
24 zzz zzzz zzz zzz zzzz zzzz zzz zzzz EE zzz EEE
"""

How to do that?

I tried this code, but it's not working well:

import re

# Input file to scan, and the output file that receives the selected lines.
original = 'Original.txt'
new = 'Padded.txt'

# Regexes that mark the first line of a paragraph.
# They are searched only in the first 30 characters of each line.
Starts = (
    r'.*AA.*AAA.*',
    r'.*BBBB.*BB.*BBB.*B.*',
    r'.*CCCC.*CCC.*',
)

# Regexes that mark the last line of a paragraph.
# They are searched only in the last 20 characters of each line.
Ends = (
    r'.*EE.*EEE',
    r'.*FF.*FFF',
    r'.*GG.*G.*GGG'
)

# Every string written to the output file is also collected here.
desired_output = []


with open( original, 'r' ) as f:
    # NOTE: opened in append mode (so repeated runs accumulate output) and
    # never closed anywhere in this snippet.
    output = open( new, 'a+' )
    start = False  # True while inside a paragraph opened by a Starts match

    for line in f:
        end = False  # becomes True if this line matches any Ends pattern

        for pattern in Starts:
            start_chars = line[:30]
            if re .search( pattern, start_chars ) != None:
                start = True

        # The end check runs after the start check, so a line matching both a
        # start and an end marker is emitted as a one-line paragraph.
        for pattern in Ends:
            end_chars = line[-20:]
            if re .search( pattern, end_chars ) != None:
                start = False
                end = True

        # NOTE: `end` alone is enough to write the line, so a line that matches
        # an end marker is written even when no paragraph was open.
        if end:
            output .write( line +'\n\n' )
            desired_output .append(line +'\n\n')
        elif start:
            output .write( line )
            desired_output .append(line)

print(desired_output)

I have tried many times, but I am still confused about how to split the paragraphs. My main goal is to split the text into paragraphs using specific start and end markers. I have lists of the start and end markers.

I also need to store the rest of the lines in another list, for later use.

Please help me.


Solution

  • Thanks, @Doyousketch2. I edited your code and it now works better.

    # Clear output left behind by a previous run: the main loop below opens
    # Padded1.txt in append mode, so stale content would otherwise accumulate.
    try:
        with open("Padded1.txt", "r+") as stale_output:
            stale_output.truncate(0)
    except FileNotFoundError:
        # First run - the file does not exist yet, so there is nothing to clear.
        pass
    
    import re

    original = 'Original1.txt'
    new = 'Padded1.txt'

    # Every string written to the output file is also kept here.
    result = []

    # `start` stays True while inside a paragraph opened by a Starts match;
    # `end` is True when the current line matches one of the Ends patterns.
    # Both are read again after the loop, so `end` is initialised up front to
    # keep it defined when the input file is empty.
    start = False
    end = False

    # NOTE: Starts and Ends are not defined in this snippet; they are the
    # regex tuples from the question's code.
    with open(original, 'r') as f, open(new, 'a+') as output:
        for line in f:
            # Start markers are looked for only in the first 30 characters of
            # the line, end markers only in the last 20.
            start_chars = line[:30]
            end_chars = line[-20:]

            if any(re.search(pattern, start_chars) for pattern in Starts):
                start = True
            end = any(re.search(pattern, end_chars) for pattern in Ends)

            if start and end:
                # Closing line of an open paragraph: write it followed by the
                # blank-line separator and leave the paragraph.
                closing = line + '\n\n\n'
                output.write(closing)
                result.append(closing)
                start = False
                end = False
            elif start:
                # Inside a paragraph that has not reached its end marker yet.
                output.write(line)
                result.append(line)
            # Otherwise no paragraph is open and the line is skipped; `end`
            # keeps its value so the code after the loop can see that the
            # final line matched an end marker outside any paragraph.
    
    # `start != end` covers both cases handled here: a paragraph that was
    # opened but never closed (start set, end not set), and a final line that
    # matched an end marker while no paragraph was open (end set, start not).
    if start != end:
        # Paragraphs in Padded1.txt are separated by four newlines (normally
        # the closing line's own newline plus the three appended after it).
        # Keep everything before the last separator and discard what follows.
        sep = "\n\n\n\n"
        with open("Padded1.txt", "r") as fd:
            d = fd.read()
        s = sep.join(d.split(sep)[:-1])
        with open("Padded1.txt", "w") as fd:
            fd.write(s)
    
    
    
    # Write every line of Original1.txt that does not appear in Padded1.txt to
    # output_file.txt.  Lines are compared - and written - right-stripped and
    # lower-cased.
    # NOTE(review): the output therefore loses the original letter case and
    # trailing whitespace; confirm that this is intended.
    with open('Original1.txt', 'r') as small_file, \
            open('Padded1.txt', 'r') as long_file, \
            open('output_file.txt', 'w') as output_file:
        # A set gives constant-time membership tests instead of scanning a
        # list once per input line.
        long_lines_cleaned = {line.rstrip().lower() for line in long_file}

        for line in small_file:
            cleaned = line.rstrip().lower()
            if cleaned not in long_lines_cleaned:
                output_file.write(cleaned + '\n')

    print('Done')