Get lines between starting and ending keywords in text file, then do post processing using python

Question

Get lines between starting and ending keywords in text file, then do post processing using python

I have a text file and would like to extract the lines between a starting keyword (Socket:) and an ending keyword (the next Socket:), then do some processing on them.

Input:

Socket: 1
Device ID: 0x0B028041 0xCC344007 0x10000834 0x00000011
CB: 3/ID: 0x445DDC13
BIBID: 0x65C     

A:0xB0000190 D:0x310020FF
A:0xB0000194 D:0x00000000
A:0xB0000198 D:0x31002010
A:0xB000019C D:0x00000017
A:0xB00001A0 D:0x31002020
A:0xB00001A4 D:0x00000017
A:0xB00001A8 D:0x31002040
A:0xB00001AC D:0x00000000
A:0xB00001B0 D:0x31001000
ART: 0xB0000800 DRT: 0xB0000000
ART: 0xB0000804 DRT: 0xB0000000
ART: 0xB0000808 DRT: 0xB0000000
ART: 0xB000080C DRT: 0xB0000000
ART: 0xB0000810 DRT: 0xB0000000
ART: 0xB0000814 DRT: 0xB0000000
ART: 0xB0000818 DRT: 0xB0000000
ART: 0xB000081C DRT: 0xB0000000
ART: 0xB0000820 DRT: 0xB0000000
ART: 0xB0000824 DRT: 0xB0000000
ART: 0xB0000828 DRT: 0xB0000000
ART: 0xB000082C DRT: 0xB0000000
ART: 0xB0000830 DRT: 0xB0000000
ART: 0xB0000834 DRT: 0xB0000000
ART: 0xB0000838 DRT: 0xB0000000
ART: 0xB000083C DRT: 0xB0000000
ART: 0xB0000840 DRT: 0xB0000000
ART: 0xB0000844 DRT: 0xB0000000
ART: 0xB0000848 DRT: 0xB0000000
ART: 0xB000084C DRT: 0xB0000000
ART: 0xB0000850 DRT: 0xB0000000
ART: 0xB0000854 DRT: 0xB0000000
ART: 0xB0000858 DRT: 0xB0000000
ART: 0xB000085C DRT: 0xB0000000
ART: 0xB0000860 DRT: 0xB0000000
ART: 0xB0000864 DRT: 0xB0000000
ART: 0xB0000868 DRT: 0xB0000000
ART: 0xB000086C DRT: 0xB0000000
ART: 0xB0000870 DRT: 0xB0000000
ART: 0xB0000874 DRT: 0xB0000000
ART: 0xB0000878 DRT: 0xB0000000
ART: 0xB000087C DRT: 0xB0000000
...
Socket:2
...

Current Code:

    import re

    from collections import defaultdict
    # Fixed labels that become part of every dictionary key built below.
    pattern = "QWL"
    rd_case = "Digital"
    setup_temp = readout_temp = "0C"

    # dict1 is filled by tde_file(); dict2 receives the values printed at the
    # end of the script. dict3 and address are not used in the code shown.
    dict1 = defaultdict(list)
    dict2 = defaultdict(list)
    dict3 = defaultdict(list)
    address = []

    # Parsing-progress flags, all cleared at start-up.
    socket_position_status = dev_id_status = CB_noS_status = trf_val_flag = False

    def tde_file():
        """Fill the module-level ``dict1`` from the ``:TEST_RESULT`` lines of the .tde file.

        Every line that starts with ``:TEST_RESULT <n>; addr: <10 chars>`` may
        store one label under the key ``(pattern, rd_case, <addr>)``:

        * ``<identifier>_<SBE|DBE|0s|1s...>`` when the line names a fail class;
        * ``DTS_<start|finish>_temp`` when the line names a DTS temperature
          value (this is written last, so it wins if both are present).

        The identifier is not reset between lines: a line without a ``cfp_``
        name reuses the identifier of the most recent line that had one, and
        a NameError is raised if no such line has been seen yet.
        """
        marker = ":TEST_RESULT"
        addr_rx = ':TEST_RESULT (\d+); addr: ([0-9A-Za-z]{10})'
        mode_rx = 'cfp_(vqs|vcs)_m(\d+) \// HB05_SB255'
        vth_rx = 'cfp_(vqs|vcs)_m\dm\d_(vth\d.\d) \// HB05_SB255'
        temp_rx = '(DTS_)value_(before|after)_test_(start|finish)'
        fail_rx = '(SBE|DBE)|(Number of (\ds) bit fail) \// HB05_SB255'

        with open(r'C:\Gert_batch file\DOE_parsing\Thebe\DOE 4 - 5K\NEW SFR\PF\tde\MRB_QWL_0c_Digital_PS60c_TC1798.tde', 'rb') as tde:
            for row in tde:
                if marker not in row:
                    continue
                head = re.match(addr_rx, row)
                if not head:
                    continue

                address_tde = head.group(2)
                slot = (pattern, rd_case, address_tde)

                # The vth form is checked second, so it overrides the plain
                # mode form when both happen to match.
                mode_hit = re.search(mode_rx, row)
                vth_hit = re.search(vth_rx, row)
                if mode_hit:
                    identifier = mode_hit.group(1) + "_m" + mode_hit.group(2).zfill(2)
                if vth_hit:
                    identifier = vth_hit.group(1) + "_m" + vth_hit.group(2)

                # Exactly one of group 1 (SBE/DBE) and group 3 (e.g. "0s") is
                # set on a match, because they sit in different alternatives.
                fail_hit = re.search(fail_rx, row)
                if fail_hit:
                    fail_kind = fail_hit.group(1) or fail_hit.group(3)
                    dict1[slot] = identifier + "_" + fail_kind

                temp_hit = re.search(temp_rx, row)
                if temp_hit:
                    dict1[slot] = temp_hit.group(1) + temp_hit.group(3) + "_temp"


    def evaluate_lot_wxy(trf_dev_id_pattern):
        """Decode lot, wafer and die position from a ``Device ID:`` line.

        The four 10-character words after ``Device ID:`` are concatenated in
        reverse order, stripped of their ``0x`` prefixes and expanded to a bit
        string (4 bits per hex digit). Fixed slices of that bit string are then
        read as unsigned integers.

        Args:
            trf_dev_id_pattern: text ending in ``Device ID: <w1> <w2> <w3> <w4>``.

        Returns:
            Tuple ``(lot, wafer, x_pos, y_pos, dev_id_status)``. ``lot`` is
            ``"ZA"`` followed by the decimal year, production week and serial
            number. ``dev_id_status`` is True only when wafer is in 1..25 and
            both coordinates are in 1..65, otherwise False.

        Raises:
            AttributeError: if the text does not match the Device ID pattern.
            ValueError: if a word contains a non-hexadecimal character.
        """
        import re

        dev_id = 'Device ID: ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10})$'
        words = re.search(dev_id, trf_dev_id_pattern)
        reversed_words = words.group(4) + words.group(3) + words.group(2) + words.group(1)
        hex_digits = re.sub('0x', '', reversed_words)
        binary_value = "".join(format(int(digit, 16), "04b") for digit in hex_digits)

        def field(start, stop):
            # Read binary_value[start:stop] as an unsigned integer.
            return int(binary_value[start:stop], 2)

        wafer = field(90, 96)
        y_pos = field(106, 113)
        x_pos = field(98, 105)
        year = field(63, 67)
        production_week = field(67, 73)
        serial_no = field(73, 83)
        lot = "ZA" + str(year) + str(production_week) + str(serial_no)

        # Always bind the status. Previously it was assigned only when the
        # range check passed, so an out-of-range ID raised UnboundLocalError
        # at the return statement instead of reporting False.
        dev_id_status = (1 <= wafer <= 25) and (1 <= x_pos <= 65) and (1 <= y_pos <= 65)
        return lot, wafer, x_pos, y_pos, dev_id_status

    # Main script: build the address -> label table in dict1, then read the
    # .trf log and collect the DRT values of one Socket block into dict2.
    tde_file()

    # NOTE(review): as posted, the two statements after this `with` line are
    # not indented under it.
    with open(r"C:\Gert_batch file\DOE_parsing\Thebe\DOE 4 - 5K\NEW SFR\PF\1kCycling\Results_452_13384\Result     Files\temp\452_20170111_021021_TC1798_MRB_QWL_0c_Digital_PS60c_1021002999.trf", "rt") as f1:
    lines = f1.read()
    print lines
    #for lines in f1.read():

    # re.search returns only the first match, so only the text from the first
    # "Socket:" up to and including the next "Socket:" keyword is processed.
    match = re.search(r'Socket:(.*?)Socket:', lines, flags=re.DOTALL)
    #print match.group()
    for line in match.group().splitlines():
        if "Socket:" in line:
            # Header line "Socket: <n>": keep the number, zero-padded to 3 digits.
            # The match ends right after the closing keyword, so that last line
            # carries no number; x0 is None there and the AttributeError is ignored.
            trf_addr = 'Socket: (\d+)$'
            x0 =re.match(trf_addr, line)
            try:
                if x0.group(1).zfill(3):
                    socket_position = x0.group(1).zfill(3)
                    socket_position_status = True
                    #print socket_position


            except AttributeError: pass

        elif "Device ID:" in line:
            dev_id = 'Device ID: ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10})$'
            x1 = re.search(dev_id,line)
            try:
                # All four operands test group(1).
                if  x1.group(1) and x1.group(1) and x1.group(1) and x1.group(1):
                    trf_dev_id_pattern = x1.group()#x1.group(4)+x1.group(3)+x1.group(2)+x1.group(1)
                    lot_wafer_x_y = evaluate_lot_wxy(trf_dev_id_pattern)
                    # The status returned inside lot_wafer_x_y is not consulted;
                    # the flag is set to True regardless.
                    dev_id_status = True
                    #print lot_wafer_x_y

            except AttributeError: pass

        elif "CB:" in line:
            # Only records that a "CB: <n>/" field was seen; the number is not kept.
            CB_pat = 'CB: (\d+)\/'
            x2 = re.search(CB_pat,line)
            try:
                if  x2.group(1):
                    CB_noS_status = True
                    #print CB_noS_status
            except AttributeError: pass

        elif"ART:" in line:
            regex = re.search("ART: ([0-9A-Za-z]{10}) DRT: ([0-9A-Za-z]{10})",line)
            #print line
            try:
                if regex.group(1) and regex.group(2):
                    # key1 is a (pattern, rd_case, address) tuple, so `in`
                    # compares the ART address against its elements.
                    for key1,val1 in dict1.iteritems():
                        if regex.group(1) in key1:
                            #print "Address:"+regex.group(1)
                            # The stored label is split on "_" and its first
                            # three parts become key components.
                            hlp_a = val1
                            hlp_b = hlp_a.split("_")
                            identifier = hlp_b[0]
                            fail_class = hlp_b[1]
                            key_addtional = hlp_b[2]
                            # Drop the first three characters of the DRT word
                            # and parse the remainder as hexadecimal.
                            val = regex.group(2)
                            value = int(val[3:],16)
                            dict2[rd_case,pattern,setup_temp,readout_temp,socket_position,fail_class,identifier,key_addtional]= value        

            except AttributeError: pass

    # Print the collected entries in sorted key order.
    for k,v in sorted(dict2.items()):
        print k,v

Current Output:

Currently the code prints the output only for the first match; I would like to get the output for every match in the input file.

('Digital', 'QWL', '0C', '0C', '001', 'finish', 'DTS', 'temp') 16
('Digital', 'QWL', '0C', '0C', '001', 'm02', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm02', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm02', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm02', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm03', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm03', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm03', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm03', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm04', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm04', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm04', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm04', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm05', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm05', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm05', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm05', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm06', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm06', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm06', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm06', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm07', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm07', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm07', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm07', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm08', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm08', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm08', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm08', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm09', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm09', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm09', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm09', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm10', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm10', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm10', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm10', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm11', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm11', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm11', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm11', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm12', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm12', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm12', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm12', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm13', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm13', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm13', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm13', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm14', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm14', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm14', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm14', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm15', 'vqs', '0s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm15', 'vqs', '1s') 0
('Digital', 'QWL', '0C', '0C', '001', 'm15', 'vqs', 'DBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'm15', 'vqs', 'SBE') 0
('Digital', 'QWL', '0C', '0C', '001', 'start', 'DTS', 'temp') 14

The above is the output for the first match, but I would like to get the output for every match in the file. Could anyone help me with this? Thanks in advance.

python

regex

python-2.7

asked on Stack Overflow Jan 24, 2017 by

user3827728 • edited Jan 26, 2017 by

user3827728

2 Answers

try this:

import re
# Sample text: one complete block followed by the start of the next one.
txt = '''Test_Socket: 1

TestA ID: 0x0B028041 0xCC344007 0x10000834 0x00000011

TestA_CB: 3/ID: 0x445DDC13

TESTA_BD: 0x65C

A:0xB0000190 D:0x310020FF

ART: 0xB0000878 DRT: 0xB0000000

ART: 0xB000087C DRT: 0xB0000000 ... Test_Socket:2'''

# DOTALL lets "." cross newlines; the lazy ".*?" stops at the nearest
# following keyword.
block_pattern = re.compile(r'Test_Socket:(.*?)Test_Socket:', flags=re.DOTALL)
match = block_pattern.search(txt)
captured = match.group(1)
print(captured)

code outputs:

TestA ID: 0x0B028041 0xCC344007 0x10000834 0x00000011

TestA_CB: 3/ID: 0x445DDC13

TESTA_BD: 0x65C

A:0xB0000190 D:0x310020FF

ART: 0xB0000878 DRT: 0xB0000000

ART: 0xB000087C DRT: 0xB0000000 ...

After extracting the lines you can iterate over them or run another regex to get what you need from these lines.

# Walk the captured text (group 1 of the match) one line at a time.
for line in match.group(1).splitlines():
    ...

answered on Stack Overflow Jan 24, 2017 by

ShmulikA • edited Jan 24, 2017 by

ShmulikA

I found a solution: I tried re.finditer() and it is working as expected. Please find my code below; if there is a better approach than this, please let me know. I would like to thank everyone.

CODE:

import re

from collections import defaultdict
# Fixed labels that become part of every dictionary key built below.
pattern = "QWL"
rd_case = "Digital"
setup_temp = readout_temp = "0C"

# dict1 is filled by tde_file(); dict3 receives the per-socket values.
# dict2 and address are not used in the code shown.
dict1 = defaultdict(list)
dict2 = defaultdict(list)
dict3 = defaultdict(list)
address = []

def tde_file():
    """Fill the module-level ``dict1`` from the ``:TEST_RESULT`` lines of the .tde file.

    A line starting with ``:TEST_RESULT <n>; addr: <10 chars>`` may store one
    label under the key ``(pattern, rd_case, <addr>)``: either
    ``<identifier>_<fail class>`` or ``DTS_<start|finish>_temp``. The DTS
    label is written last and therefore wins when both apply.

    The identifier is remembered across lines: a line without a ``cfp_`` name
    reuses the one from the most recent line that had it, and a NameError is
    raised if none has been seen yet.
    """
    result_tag = ":TEST_RESULT"
    header_rx = ':TEST_RESULT (\d+); addr: ([0-9A-Za-z]{10})'
    plain_rx = 'cfp_(vqs|vcs)_m(\d+) \// HB05_SB255'
    vth_rx = 'cfp_(vqs|vcs)_m\dm\d_(vth\d.\d) \// HB05_SB255'
    dts_rx = '(DTS_)value_(before|after)_test_(start|finish)'
    fail_rx = '(SBE|DBE)|(Number of (\ds) bit fail) \// HB05_SB255'

    with open(r'C:\Gert_batch file\DOE_parsing\Thebe\DOE 4 - 5K\NEW SFR\PF\tde\MRB_QWL_0c_Digital_PS60c_TC1798.tde', 'rb') as source:
        for text in source:
            if result_tag not in text:
                continue
            header = re.match(header_rx, text)
            if not header:
                continue

            address_tde = header.group(2)
            entry_key = (pattern, rd_case, address_tde)

            # The vth form is applied second, so it overrides the plain form.
            plain = re.search(plain_rx, text)
            with_vth = re.search(vth_rx, text)
            if plain:
                identifier = plain.group(1) + "_m" + plain.group(2).zfill(2)
            if with_vth:
                identifier = with_vth.group(1) + "_m" + with_vth.group(2)

            # Groups 1 and 3 belong to different alternatives, so exactly one
            # of them is set whenever the pattern matches.
            failure = re.search(fail_rx, text)
            if failure:
                suffix = failure.group(1) or failure.group(3)
                dict1[entry_key] = identifier + "_" + suffix

            dts = re.search(dts_rx, text)
            if dts:
                dict1[entry_key] = dts.group(1) + dts.group(3) + "_temp"

def evaluate_lot_wxy(trf_dev_id_pattern):
    """Decode lot, wafer and die position from a ``Device ID:`` line.

    The four 10-character words are joined in reverse order, their ``0x``
    prefixes removed, and each remaining hex digit expanded to 4 bits. Fixed
    slices of the resulting bit string are read as unsigned integers.

    Returns ``(lot, wafer, x_pos, y_pos, dev_id_status)``; the status is True
    only when wafer is in 1..25 and both coordinates are in 1..65.

    Raises AttributeError when the text does not match the Device ID pattern.
    """
    import re

    dev_id = 'Device ID: ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10})$'
    found = re.search(dev_id, trf_dev_id_pattern)
    joined = "".join(found.group(n) for n in (4, 3, 2, 1))
    digits = re.sub('0x', '', joined)
    bits = "".join(bin(int(ch, 16))[2:].zfill(4) for ch in digits)

    wafer = int(bits[90:96], 2)
    y_pos = int(bits[106:113], 2)
    x_pos = int(bits[98:105], 2)
    year = int(bits[63:67], 2)
    production_week = int(bits[67:73], 2)
    serial_no = int(bits[73:83], 2)

    lot = "ZA" + "".join(str(part) for part in (year, production_week, serial_no))
    dev_id_status = (1 <= wafer <= 25) and (1 <= x_pos <= 65) and (1 <= y_pos <= 65)
    return lot, wafer, x_pos, y_pos, dev_id_status

# Main script: build the address -> label table in dict1, then walk every
# Socket block of the .trf log and store its DRT values in dict3.
tde_file()
with open(r"C:\Gert_batch file\DOE_parsing\Thebe\DOE 4 - 5K\NEW SFR\PF\1k Cycling\Results_452_13384\Result Files\452_20170111_021021_TC1798_MRB_QWL_0c_Digital_PS60c_1021002999.trf") as f1:
        lines = f1.read()
        # The status flags are initialised once here and are not reset between
        # Socket blocks.
        socket_position_status = False
        dev_id_status = False
        CB_noS_status = False
        trf_val_flag = False
        # finditer yields every non-overlapping span from a "Socket:" keyword to
        # the fixed terminator line below.
        # NOTE(review): the terminator is a literal ART/DRT line, presumably the
        # last line of each block in this particular file - confirm before
        # reusing on other logs.
        for m in re.finditer(r'Socket:(.*?)ART: 0xB00017EC DRT: 0x00000000\n', lines,flags=re.DOTALL):
            x1 = ('%s' % (m.group(0)))

            # x1 is rebound to a match object inside this loop; the list of
            # lines being iterated was already built from the string above.
            for line in x1.splitlines():
                if "Socket:" in line:
                    # Header line "Socket: <n>": keep the number, zero-padded
                    # to 3 digits.
                    trf_addr = 'Socket: (\d+)$'
                    x0 =re.match(trf_addr, line)
                    try:
                        if x0.group(1).zfill(3):
                            socket_position = x0.group(1).zfill(3)
                            socket_position_status = True
                            #print socket_position


                    except AttributeError: pass

                elif "Device ID:" in line:
                    dev_id = 'Device ID: ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10}) ([0-9a-zA-Z]{10})$'
                    x1 = re.search(dev_id,line)
                    try:
                        # All four operands test group(1).
                        if  x1.group(1) and x1.group(1) and x1.group(1) and x1.group(1):
                            trf_dev_id_pattern = x1.group()#x1.group(4)+x1.group(3)+x1.group(2)+x1.group(1)
                            lot_wafer_x_y = evaluate_lot_wxy(trf_dev_id_pattern)
                            # Fifth tuple item is the range-check result.
                            dev_id_status = lot_wafer_x_y[4]
                            #print lot_wafer_x_y

                    except AttributeError: pass

                elif "CB:" in line:
                    # Remember the number from the "CB: <n>/" field.
                    CB_pat = 'CB: (\d+)\/'
                    x2 = re.search(CB_pat,line)
                    try:
                        if  x2.group(1):
                            CB_no = x2.group(1)
                            CB_noS_status = True
                            #print CB_noS_status
                    except AttributeError: pass

                elif"ART:" in line:
                    regex = re.search("ART: ([0-9A-Za-z]{10}) DRT: ([0-9A-Za-z]{10}$)",line)
                    #print line
                    try:
                        if regex.group(1) and regex.group(2):
                            # key1 is a (pattern, rd_case, address) tuple, so
                            # `in` compares the ART address against its elements.
                            for key1,val1 in dict1.iteritems():
                                if regex.group(1) in key1:
                                    #print "Address:"+regex.group(1)
                                    # The stored label is split on "_" and its
                                    # first three parts become key components.
                                    hlp_a = val1
                                    hlp_b = hlp_a.split("_")
                                    identifier = hlp_b[0]
                                    fail_class = hlp_b[1]
                                    key_addtional = hlp_b[2]
                                    # Drop the first three characters of the DRT
                                    # word and parse the remainder as hexadecimal.
                                    val = regex.group(2)
                                    value = int(val[3:],16)
                                    trf_val_flag = True
                                    # Store only when the Device ID passed its range
                                    # check and a CB number has been seen.
                                    if dev_id_status and trf_val_flag and CB_noS_status:
                                        dict3[rd_case,pattern,setup_temp,readout_temp,CB_no,socket_position,fail_class,identifier,key_addtional]= value
                                        # Last statement of the inner loop body.
                                        continue


                    except AttributeError: pass

    #elif socket_position_status and dev_id_status and CB_noS_status and trf_val_flag:
        #dict3[rd_case,pattern,setup_temp,readout_temp,socket_position,fail_class,identifier]= value

# Report how many distinct keys were collected.
print len(dict3.keys())

answered on Stack Overflow Feb 2, 2017 by

user3827728

User contributions licensed under CC BY-SA 3.0