How to select data between two special characters in Python?

I have a file abcd.txt containing:
"""
hello,123 [1231,12312]1231231
hello, world[3r45t,3242]6542
123 213 135
4234 gdfg gfd 32
sd23 234 sdf 23
hi, hello[234,23423]561
hello, hi[123,123]985
"""
I want to print the string that comes after the second ',' character, up to the ']'.
My output should be:
12312
3242
23423
123
I tried this:
import re

def select(self):
    file = open('gis.dat')
    list1 = []
    for line in file:
        line = line.strip()
        if re.search('[a-zA-Z]', line):
            list1.append(line.partition(',')[-1].rpartition(']')[0])
    return list1

You may use:
import re

for line in open("abcd.txt"):
    match = re.findall(r".*?,.*?,(\d+)", line)
    if match:
        print(match[0])
Output:
12312
3242
23423
123
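If you would rather stay with the string methods from your own attempt instead of a regex, a minimal sketch along the same lines could look like this (the select name and the default path are just placeholders for illustration):

def select(path='abcd.txt'):
    results = []
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            # Only lines containing ']' have the bracketed part we care about.
            if ']' not in line:
                continue
            # Keep everything up to the ']', then take what follows the last ','.
            inside = line.split(']', 1)[0]
            results.append(inside.rsplit(',', 1)[-1])
    return results

print(select())  # ['12312', '3242', '23423', '123']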

Related

How to use the tabulate library?

I am trying to use tabulate with the zip_longest function, so I have it like this:
from __future__ import print_function
from tabulate import tabulate
from itertools import zip_longest
import itertools
import locale
import operator
import re

verdi50 = "['INGBNL2A, VAT number: NL851703884B01 i\nTel, +31 (0}1 80 61 88 \n\nrut ard wegetables\n\x0c']"

fruit_words = ['Appels', 'Ananas', 'Peen Waspeen',
               'Tomaten Cherry', 'Sinaasappels',
               'Watermeloenen', 'Rettich', 'Peren', 'Peen', 'Mandarijnen', 'Meloenen', 'Grapefruit']

def total_amount_fruit_regex(format_=re.escape):
    return r"(\d*(?:\.\d+)*)\s*~?=?\s*(" + '|'.join(
        format_(word) for word in fruit_words) + ')'

def total_fruit_per_sort():
    number_found = re.findall(total_amount_fruit_regex(), verdi50)
    fruit_dict = {}
    for n, f in number_found:
        fruit_dict[f] = fruit_dict.get(f, 0) + int(n)
    result = '\n'.join(f'{key}: {val}' for key, val in fruit_dict.items())
    return result

def fruit_list(format_=re.escape):
    return "|".join(format_(word) for word in fruit_words)

def findallfruit(regex):
    return re.findall(regex, verdi50)

def verdi_total_number_fruit_regex():
    return rf"(\d*(?:\.\d+)*)\s*\W+(?:{fruit_list()})"

def show_extracted_data_from_file():
    regexes = [
        verdi_total_number_fruit_regex(),
    ]
    matches = [findallfruit(regex) for regex in regexes]
    fruit_list = total_fruit_per_sort().split("\n")
    return "\n".join(" \t ".join(items) for items in zip_longest(tabulate(*matches, fruit_list, headers=['header', 'header2'], fillvalue='', )))

print(show_extracted_data_from_file())
But then I get this error:
TypeError at /controlepunt140
tabulate() got multiple values for argument 'headers'
So how can I improve this?
If I remove the tabulate call, the output looks like this:
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
So the expected output, with headers, is:
header1 header2
------- -------
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
Like how it works in tabulate.
You should be passing a single table to the tabulate() function; passing multiple positional lists is what produces the TypeError: tabulate() got multiple values for argument 'headers' you are seeing.
Updating your return statement:
def show_extracted_data_from_file():
    regexes = [
        verdi_total_number_fruit_regex(),
    ]
    matches = [findallfruit(regex) for regex in regexes]
    fruit_list = total_fruit_per_sort().split("\n")
    return tabulate(zip_longest(*matches, fruit_list), headers=['header1', 'header2'])
Output:
header1 header2
--------- ------------------
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
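As a side note on the signature: tabulate() expects the whole table as its single positional argument (an iterable of rows), with the headers passed by keyword. A stripped-down sketch of the pattern used above, with the column data taken from your sample output:

from itertools import zip_longest
from tabulate import tabulate

# Two columns of unequal length; zip_longest pads the shorter one with None,
# which tabulate renders as an empty cell by default.
col1 = ['16', '360', '6', '75']
col2 = ['Watermeloenen: 466', 'Appels: 688', 'Sinaasappels: 803']
print(tabulate(zip_longest(col1, col2), headers=['header1', 'header2']))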

Python regex and several named groups found in line

I want to extract some specific information from CFT (a tool that copies files from one OS to another).
I parse the output of the command "cftutil listcat content=full".
I want to extract dates, times, the number of records in the local and remote files, and so on.
For example, I get lines like:
Records NRECS = 49016 Records NREC = 49016
where I want to extract the value after NRECS or NREC
or lines like
Begin date DATEB = 15/02/2019 End date DATEE = 15/02/2019
Begin time TIMEB = 12:18:21.05 End time TIMEE = 12:18:23.16
where I want the date and time for the beginning and end of the transfer.
My regex only gets the first match in the line.
My program:
import re
reg = r"""
(^.*)DIRECT\s+=\s(?P<direct>[A-Z]{4})
|
(^.*)DATEE\s+=\s(?P<date_end>\d{2}\/\d{2}\/\d{4})
|
(^.*)DATEB\s+=\s(?P<date_deb>\d{2}\/\d{2}\/\d{4})
|
(^.*)TIMEB\s+=\s(?P<hour_deb>\d{2}:\d{2}:\d{2}.\d{2})
|
(^.*)TIMEE\s+=\s(?P<hour_end>\d{2}:\d{2}:\d{2}.\d{2})
|
(^.*)NREC\s+=\s(?P<nb_records_loc>\d+)
|
(^.*)NRECS\s+=\s(?P<nbrecords_rem>\d+)
"""
pat1 = re.compile(reg, re.VERBOSE)

with open("CFT_FULL.TXT", "r") as a:
    source = a.read().split('\n')

for i, lin in enumerate(source):
    if (" FNAME" in lin or " NFNAME" in lin):
        print '\n'.join(source[i+1:i+2])
    m = re.search(pat1, lin)
    if m is not None:
        print m.lastgroup, "---> ", m.group(m.lastindex)
    if "JOBNAME" in lin:
        print lin, '\n'
Edit: partial output
date_end ---> 06/02/2019
hour_deb ---> 08:19:48.63
nb_records_loc ---> 139
But I should get:
date_deb ---> 06/02/2019
date_end ---> 06/02/2019
hour_deb ---> 08:19:48.63
hour_end ---> 08:19:49.52
Thanks for any hint
Edit:
Eventually, the following code works fine:
m = re.search(pat1, lin)
if m is not None:
    for x in re.finditer(pat1, lin):
        print x.lastgroup, "--->", x.group(x.lastindex)
It prints:
date_deb ---> 06/02/2019
date_end ---> 06/02/2019
hour_deb ---> 08:19:58.64
hour_end ---> 08:19:58.75
nbrecords_rem ---> 62
nb_records_loc ---> 62
Please change your regex to:
reg = r"""
DIRECT\s+=\s(?P<direct>[A-Z]{4})
|
DATEE\s+=\s(?P<date_end>\d{2}\/\d{2}\/\d{4})
|
DATEB\s+=\s(?P<date_deb>\d{2}\/\d{2}\/\d{4})
|
TIMEB\s+=\s(?P<hour_deb>\d{2}:\d{2}:\d{2}.\d{2})
|
TIMEE\s+=\s(?P<hour_end>\d{2}:\d{2}:\d{2}.\d{2})
|
NREC\s+=\s(?P<nb_records_loc>\d+)
|
NRECS\s+=\s(?P<nbrecords_rem>\d+)
"""
Also, since re.search() returns only the first match, I would suggest using re.finditer(). E.g., for the string below:
Begin date DATEB = 15/02/2019 End date DATEE = 15/02/2019
Begin time TIMEB = 12:18:21.05 End time TIMEE = 12:18:23.16
The expected output will be:
>>> for x in re.finditer(pat1, above_string):
...     print(x)
<_sre.SRE_Match object; span=(13, 34), match='DATEB = 15/02/2019'>
<_sre.SRE_Match object; span=(46, 67), match='DATEE = 15/02/2019'>
<_sre.SRE_Match object; span=(82, 104), match='TIMEB = 12:18:21.05'>
<_sre.SRE_Match object; span=(115, 137), match='TIMEE = 12:18:23.16'>
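Putting the cleaned-up regex and re.finditer() together, a minimal Python 3 sketch that prints the named groups for the sample lines above (only the date/time alternatives are shown here) would be:

import re

reg = r"""
DATEB\s+=\s(?P<date_deb>\d{2}/\d{2}/\d{4})
|
DATEE\s+=\s(?P<date_end>\d{2}/\d{2}/\d{4})
|
TIMEB\s+=\s(?P<hour_deb>\d{2}:\d{2}:\d{2}\.\d{2})
|
TIMEE\s+=\s(?P<hour_end>\d{2}:\d{2}:\d{2}\.\d{2})
"""
pat1 = re.compile(reg, re.VERBOSE)

line = "Begin date DATEB = 15/02/2019   End date DATEE = 15/02/2019"
for x in re.finditer(pat1, line):
    # lastgroup holds the name of the alternative that matched this time.
    print(x.lastgroup, "--->", x.group(x.lastindex))

which prints date_deb ---> 15/02/2019 followed by date_end ---> 15/02/2019.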

After splitting add newline

After splitting, a newline is left at the end of the last piece.
Both queues have size 3.
How can I split without the '\n' at the end?
source.txt:
234;234
456;567
4567;6789
exec:
with open('source.txt') as line:
    for source in line:
        result = source.split(";")
        qM.put(result[0])
        qP.put(result[1])
        print(result[0])
        print(result[1])
        print('----')
result:
234
234
----
456
567
----
4567
6789
----
Please check this, using strip():
import Queue

qM = Queue.Queue()
qP = Queue.Queue()
with open('txt') as line:
    for source in line:
        result = source.strip().split(";")
        qM.put(result[0])
        qP.put(result[1])
        print(result[0])
        print(result[1])
        print('----')
Output:
234
234
----
456
567
----
4567
6789
----
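If you only want to drop the trailing newline and keep any other whitespace intact, a small variant (assuming the same source.txt, with the loop variables renamed for clarity) is to use rstrip('\n') instead of strip():

with open('source.txt') as fh:
    for raw_line in fh:
        # rstrip('\n') removes only the trailing newline, nothing else.
        result = raw_line.rstrip('\n').split(";")
        print(result)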

Python: compare column in two files

I'm trying to solve this text-processing task using Python, but I'm not able to compare the columns.
What I have tried:
#!/usr/bin/env python
import sys

def Main():
    print "This is your input Files %s,%s" % (file1, file2)
    f1 = open(file1, 'r')
    f2 = open(file2, 'r')
    for line in f1:
        column1_f1 = line.split()[:1]
        #print column1_f1
        for check in f2:
            column2_f2 = check.split()[:1]
            print column1_f1, column2_f2
            if column1_f1 == column2_f2:
                print "Match", line
            else:
                print line, check
    f1.close()
    f2.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, "This Script need exact 2 argument, aborting"
        exit(1)
    else:
        ThisScript, file1, file2 = sys.argv
        Main()
I'm new to Python. Please help me learn and understand this.
I would resolve it in Python 3 in a similar way to what user46911 did with awk: read the second file and save its keys in a dictionary, then check, for each line of the first file, whether its key exists:
import sys

codes = {}
with open(sys.argv[2], 'r') as f2:
    for line in f2:
        fields = line.split()
        codes[fields[0]] = fields[1]

with open(sys.argv[1], 'r') as f1:
    for line in f1:
        fields = line.split(None, 1)
        if fields[0] in codes:
            print('{0:4s}{1:s}'.format(codes[fields[0]], line[4:]), end='')
        else:
            print(line, end='')
Run it like:
python3 script.py file1 file2
That yields:
060090 AKRABERG FYR DN 6138 -666 101
EKVG 060100 VAGA FLOGHAVN DN 6205 -728 88
060110 TORSHAVN DN 6201 -675 55
060120 KIRKJA DN 6231 -631 55
060130 KLAKSVIK HELIPORT DN 6221 -656 75
060160 HORNS REV A DN 5550 786 21
060170 HORNS REV B DN 5558 761 10
060190 SILSTRUP DN 5691 863 0
060210 HANSTHOLM DN 5711 858 0
EKGF 060220 TYRA OEST DN 5571 480 43
EKTS 060240 THISTED LUFTHAVN DN 5706 870 8
060290 GROENLANDSHAVNEN DN 5703 1005 0
EKYT 060300 FLYVESTATION AALBORG DN 5708 985 13
060310 TYLSTRUP DN 5718 995 0
060320 STENHOEJ DN 5736 1033 56
060330 HIRTSHALS DN 5758 995 0
EKSN 060340 SINDAL FLYVEPLADS DN 5750 1021 28
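For what it's worth, the reason your nested-loop version only compares against the second file once is that f2 is a file object, and it is exhausted after the first pass of the outer loop; on every later iteration over f1 the inner for check in f2 loop has nothing left to read. A tiny illustration of that exhaustion (file2.txt is a hypothetical name):

f2 = open('file2.txt')
print(len(list(f2)))  # number of lines -- the whole file is consumed here
print(len(list(f2)))  # 0 -- the file object is already at end-of-file
f2.close()

Building the dictionary once, as above, sidesteps this and also turns the comparison for each line of the first file into a single dictionary lookup.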

How can I replace blank entries in a text table with 0 in Python?

I have tables which looks like this:
text = """
ID = 1234
Hello World 135,343 117,668 81,228
Another line of text (30,632) (48,063)
More text 0 11,205 0
Even more text 1,447 681
ID = 18372
Another table 35,323 38,302 909,381
Another line with text 13 15
More text here 7 0
Even more text here 7,011 1,447 681
"""
Is there a way to replace the "blank" entries in each table with 0? I am trying to set delimiters between the entries, but the following code can't deal with blank spots in the tables:
for line in text.splitlines():
    if 'ID' not in line:
        line1 = line.split()
        line = '|'.join((' '.join(line1[:-3]), '|'.join(line1[-3:])))
        print line
    else:
        print line
The output is:
ID = 1234
|
Hello World|135,343|117,668|81,228
Another line of|text|(30,632)|(48,063)
More text|0|11,205|0
Even more|text|1,447|681
|
ID = 18372
|
Another table|35,323|38,302|909,381
Another line with|text|13|15
More text|here|7|0
Even more text here|7,011|1,447|681
As you can see, the first problem shows up on the second line of the first table: the word 'text' is treated as the first data column. Is there any way to fix this in Python so that blank entries are replaced with 0?
Here is a function for finding columns in a bunch of lines. The second argument, pat, defines what counts as a column boundary and can be any regex.
import itertools as it
import re

def find_columns(lines, pat=r' '):
    '''
    Usage:
    widths = find_columns(lines)
    for line in lines:
        if not line: continue
        vals = [line[widths[i]:widths[i+1]].strip() for i in range(len(widths)-1)]
    '''
    widths = []
    maxlen = max(len(line) for line in lines)
    for line in lines:
        line = ''.join([line, ' ' * (maxlen - len(line))])
        candidates = []
        for match in re.finditer(pat, line):
            candidates.extend(range(match.start(), match.end() + 1))
        widths.append(set(candidates))
    widths = sorted(set.intersection(*widths))
    diffs = [widths[i+1] - widths[i] for i in range(len(widths)-1)]
    diffs = [None] + diffs
    widths = [w for d, w in zip(diffs, widths) if d != 1]
    if widths[0] != 0:
        widths = [0] + widths
    return widths

def report(text):
    for key, group in it.groupby(text.splitlines(), lambda line: line.startswith('ID')):
        lines = list(group)
        if key:
            print('\n'.join(lines))
        else:
            # r' (?![a-zA-Z])' defines a column to be any whitespace
            # not followed by alphabetic characters.
            widths = find_columns(lines, pat=r'\s(?![a-zA-Z])')
            for line in lines:
                if not line: continue
                vals = [line[widths[i]:widths[i+1]] for i in range(len(widths)-1)]
                vals = [v if v.strip() else v[1:] + '0' for v in vals]
                print('|'.join(vals))

text = """\
ID = 1234
Hello World 135,343 117,668 81,228
Another line of text (30,632) (48,063)
More text 0 11,205 0
Even more text 1,447 681
ID = 18372
Another table 35,323 38,302 909,381
Another line with text 13 15
More text here 7 0
Even more text here 7,011 1,447 681
"""

report(text)
yields
ID = 1234
Hello World | 135,343| 117,668| 81,228
Another line of text| (30,632)| 0| (48,063)
More text | 0 | 11,205| 0
Even more text | 0| 1,447 | 681
ID = 18372
Another table | 35,323| 38,302| 909,381
Another line with text| 13 | 15|0
More text here | 0| 7 | 0
Even more text here | 7,011| 1,447| 681
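In case the column detection looks opaque: find_columns records, for every line, the set of character positions matched by pat (after padding short lines with spaces), intersects those sets across all lines, and then keeps only the first position of each shared run as a column boundary. A small, self-contained illustration of the intersection step, using made-up lines:

import re

lines = ["aa  11  22",
         "bb  33  44"]
position_sets = []
for line in lines:
    positions = set()
    for m in re.finditer(r' ', line):
        positions.update(range(m.start(), m.end() + 1))
    position_sets.append(positions)

# Positions that are separators in *every* line are column-boundary candidates.
print(sorted(set.intersection(*position_sets)))  # [2, 3, 4, 6, 7, 8]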
