Unicode error in Python

Here is the gist of the problem.
I am trying to get data from a REST API call and store it in a database.
Then I run a few queries to find the top 3 users. I could not pack all the list values I am getting from MySQL into a JSON file.
I am unable to get past the following issue.
File "/Users/id1/Downloads/user1.py", line 58, in
get_last_three_installed_user
results.append(dict(zip(columns, row)))
TypeError: 'unicode' object is not callable
This is the output of a SQL query
+----------------+--------+-------------+------------+-----------------+
| name | gender | nationality | registered | registered_date |
+----------------+--------+-------------+------------+-----------------+
| mélissa robin | female | FR | 1437761753 | 2015-07-24 |
| agathe fabre | female | FR | 1437002837 | 2015-07-15 |
| soline morin | female | FR | 1436138376 | 2015-07-05 |
+----------------+--------+-------------+------------+-----------------+
If I try str(name), I get the following error:
name = str(json_dict["results"][result]["user"]["name"]["first"]) + " " + str(json_dict["results"][result]["user"]["name"]["last"])
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 1: ordinal not in range(128)
Here is my code:
def get_last_three_installed_user(file_type):
    count_sql = "select name,gender,nationality,registered,DATE_FORMAT(from_unixtime(registered), '%Y-%m-%d') registered_date from install_user order by from_unixtime(registered) desc limit 3 "
    curs.execute(count_sql)
    columns = [column[0] for column in curs.description]
    results = []
    if file_type == 'csv':
        fp = open('user_list.csv', 'w')
        csvFile = csv.writer(fp)
        rows = curs.fetchall()
        csvFile.writerows(rows)
    else:
        with open('file_count.json', 'w') as outfile:
            for row in curs.fetchall():
                results.append(dict(zip(columns, row)))
            print results
            output = {"TableData": results}
            json.dump(output, outfile, sort_keys = True, indent = 4, ensure_ascii=False)

This code pretty much took care of it.
def get_last_three_installed_user(file_type):
    count_sql = "select name,gender,nationality,registered,DATE_FORMAT(from_unixtime(registered), '%Y-%m-%d') registered_date from install_user order by from_unixtime(registered) desc limit 1,3 "
    curs.execute(count_sql)
    results = []
    if file_type == 'csv':
        fp = open('user_list.csv', 'w')
        csvFile = csv.writer(fp)
        rows = curs.fetchall()
        csvFile.writerows(rows)
    else:
        with open('file_count.json', 'w') as outfile:
            for row in curs.fetchall():
                # build a fresh dict per row, otherwise every entry in
                # results would point at the same (last) row
                dict1 = {}
                for idx, col in enumerate(curs.description):
                    dict1[col[0]] = row[idx]
                results.append(dict1)
            output = {"TableData": results}
            json.dump(output, outfile, sort_keys = True, indent = 4, ensure_ascii=False)
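As for the two original errors themselves: TypeError: 'unicode' object is not callable usually means a built-in name such as dict or zip has been rebound to a unicode string somewhere earlier in the script, and the UnicodeEncodeError comes from calling str() on non-ASCII unicode in Python 2. A minimal sketch (assuming json_dict is the parsed REST response, as in the question) of building the name without str():

first = json_dict["results"][result]["user"]["name"]["first"]
last = json_dict["results"][result]["user"]["name"]["last"]
name = u"%s %s" % (first, last)    # stays unicode, so no implicit ASCII encode step
name_utf8 = name.encode("utf-8")   # explicit UTF-8 bytes only where bytes are required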


Parse ascii table header

So I need to parse this into a dataframe or a list:
tmp = ['+--------------+-----------------------------------------+',
       '| Something to | Some header with subheader              |',
       '| watch or     +-----------------+-----------------------+',
       '| idk          | First           | another text again    |',
       '|              |                 | with one more line    |',
       '|              |                 +-----------------------+',
       '|              |                 | and this  | how it be |',
       '+--------------+-----------------+-----------------------+']
It is just txt table with strange header. I need to transform it to this:
['Something to watch or idk', 'Some header with subheader First', 'Some header with subheader another text again with one more line and this', 'Some header with subheader another text again with one more line how it be']
Here's my first solution that gets me closer to victory (you can see my tries in the comments):
pluses = [i for i, element in enumerate(tmp) if element[0] == '+']
tmp2 = tmp[pluses[0]:pluses[1]+1].copy()
table_str=''.join(tmp[pluses[0]:pluses[1]+1])
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
tmp3=[]
strt = ''.join(tmp2.copy())
table_list = [l.strip().replace('\n', '') for l in re.split(r'\+[+-]+', strt) if l.strip()]
for row in table_list:
    joined_row = ['' for _ in range(len(row))]
    for lines in [line for line in row.split('||')]:
        line_part = [i.strip() for i in lines.split('|') if i]
        joined_row = [i + j for i, j in zip(joined_row, line_part)]
    tmp3.append(joined_row)
Here's the output:
tmp3
out[4]:
[['Something to', 'Some header with subheader'],
['Something towatch or'],
['idk', 'First', 'another text again'],
['idk', 'First', 'another text againwith one more line'],
['idk'],
['', '', 'and this', 'how it be']]
It only remains to join this in the right way, but idk how to...
Here's an addon:
We can locate the pluses and splitters with this:
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
[[0, 15, 57],
[0, 15, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 45, 57],
[0, 15, 33, 57]]
And then we can split or group by cell, but idk how to do that either... Please help.
Example No.2:
+----------+------------------------------------------------------------+---------------+----------------------------------+--------------------+-----------------------+
| Number | longtextveryveryloooooong | aaaaaaaaaaa | bbbbbbbbbbbbbbbbbb | dfsdfgsdfddd |qqqqqqqqqqqqqqqqqqqqqq |
| string | | | ccccccccccccccccccccc | affasdd as |qqqqqqqqqqqqqqqqqqqqqq |
| | | | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,| seeerrrr e, | dfsdfffffffffffff |
| | | | anothertext and something | percent | ttttttttttttttttt |
| | | | (nothingtodo), | | sssssssssssssssssssss |
| | | | and text | |zzzzzzzzzzzzzzzzzzzzzz |
| | | +----------------------------------+ | b rererereerr ppppppp |
| | | | all | longtext wit- | | |
| | | | |h many character| | |
+----------+------------------------------------------------------------+---------------+-----------------+----------------+--------------------+-----------------------+
You could do it recursively, parsing one "sub table" at a time:
def parse_table(table, header='', root='', table_len=None):
    # store length of original table
    if not table_len:
        table_len = len(table)
    # end of current "column"
    col = table[0].find('+', 1)
    rows = [
        row for row in range(1, len(table))
        if table[row].startswith('+')
        and table[row][col] == '+'
    ]
    row = rows[0]
    # split "line" contents into columns
    # end of "line" is either `+` or final `|`
    end = col
    num_cols = table[0].count('+')
    if num_cols != table[1].count('|'):
        end = table[1].rfind('|')
    columns = (line[1:end].split('|') for line in table[1:row])
    # rebuild each column appending to header
    content = [
        ' '.join([header] + [line.strip() for line in lines]).strip()
        for lines in zip(*columns)
    ]
    # is there a table below?
    if row + 2 < len(table):
        header = content[-1]
        # if we are not the last table - we are a header
        if len(rows) > 1:
            header = content.pop()
        # if we are the first table in column - we are the root
        if not root:
            root = header
        next_table = [line[:col + 1] for line in table[row:]]
        content.extend(
            parse_table(
                next_table,
                header=header,
                root=root,
                table_len=table_len
            )
        )
    # is there a table to the right?
    if col + 2 < len(table[0]):
        # find start line of next table
        row = next(
            row for row, line in enumerate(table, start=-1)
            if line[col] == '|'
        )
        next_table = [line[col:] for line in table[row:]]
        # new top-level table - reset root
        if len(next_table) == table_len:
            root = ''
        # next table on same level - reset header
        if len(table) == len(next_table):
            header = root
        content.extend(
            parse_table(
                next_table,
                header=header,
                root=root,
                table_len=table_len
            )
        )
    return content
Output:
>>> parse_table(table)
['Something to watch or idk',
'Some header with subheader First',
'Some header with subheader another text again with one more line and this',
'Some header with subheader another text again with one more line how it be']
>>> parse_table(big_table)
['Number string',
'longtextveryveryloooooong',
'aaaaaaaaaaa',
'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text all',
'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text longtext wit- h many character',
'dfsdfgsdfddd affasdd as seeerrrr e, percent',
'qqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqq dfsdfffffffffffff ttttttttttttttttt sssssssssssssssssssss zzzzzzzzzzzzzzzzzzzzzz b rererereerr ppppppp']
>>> parse_table(planets)
['Planets Planet Sun (Solar) Earth Moon Mars',
'Planets R (km) 696000 6371 1737 3390',
'Planets mass (x 10^29 kg) 1989100000 5973.6 73.5 641.85']
As the input is in the format of a reStructuredText table, you could use the docutils table parser.
import docutils.parsers.rst.tableparser
import docutils.statemachine
from collections.abc import Iterable

def extract_texts(tds):
    " recursively extract StringLists and join"
    texts = []
    for e in tds:
        if isinstance(e, docutils.statemachine.StringList):
            texts.append(' '.join([s.strip() for s in list(e) if s]))
            break
        if isinstance(e, Iterable):
            texts.append(extract_texts(e))
    return texts
>>> parser = docutils.parsers.rst.tableparser.GridTableParser()
>>> tds = parser.parse(docutils.statemachine.StringList(tmp))
>>> extract_texts(tds)
[[],
[],
[[['Something to watch or idk'], ['Some header with subheader']],
[['First'], ['another text again with one more line']],
[['and this | how it be']]]]
then flatten.
For more general usage, it is worth taking a look at tds (the structure returned by parse); there is some documentation for it.
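A small sketch of the "then flatten" step (flatten is my own helper, not part of docutils), assuming tds from the session above; note this only removes the nesting, it does not join headers onto their sub-cells the way parse_table above does:

def flatten(nested):
    flat = []
    for item in nested:
        if isinstance(item, list):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat

>>> flatten(extract_texts(tds))
['Something to watch or idk', 'Some header with subheader', 'First', 'another text again with one more line', 'and this | how it be']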

So close yet I can't figure out what's wrong

I am taking a course through school and have this challenge on Codio:
For your final challenge in this unit, you will load two files:
The first file F1 will have information about some accounts. It will be pipe-delimited and have one record per line, with these fields:
ACCOUNT NUMBER | PIN CODE | BALANCE
The second file F2 will contain instructions: one on each line. The instructions will look like this:
COMMAND | AMOUNT | ACCOUNT NUMBER | PIN CODE
COMMAND will be either add or sub. If the command is add, you will add AMOUNT to the BALANCE in the account files F1. If the command is sub, you will subtract.
However, there are a number of reasons for which you may need to reject the transaction. If you are asked to subtract an amount that would put the account below zero or if the pin code you are provided does not match the pin code in the account record, the transaction is ignored.
Account Transactions
Given pipe-delimited files F1 and F2 where F1 contains accounts with fields ACCOUNT NUM|PIN|BALANCE and F2 contains transaction instructions COMMAND|AMOUNT|ACCOUNT NUM|PIN, execute the transactions, storing the results back in F1.
The COMMAND field will be add or sub indicating addition or subtraction from the account.
Transactions which do not provide the correct PIN code or attempt to put the account below zero should be ignored.
This is my code for the challenge:
records = []
with open(F1,'r') as account_info:
    content = account_info.readlines()
    for row in content:
        recordList = row.strip("\n").split('|')
        records.append(recordList)

records2 = []
with open(F2,'r') as user_input:
    content2 = user_input.readlines()
    for row in content2:
        recordList2 = row.strip("\n").split('|')
        records2.append(recordList2)

for i in range(len(records)):
    row = records[i]
for i in range(len(records2)):
    row = records2[i]
for row in records and records2:
    if records[i][1] == records2[i][3] and records2[i][0] == "add":
        newBalance = int(records[i][2]) + int(records2[i][1])
        records[i][2] = str(newBalance)
    elif records2[i][0] == "sub" and int(records[i][2]) >= int(records2[i][1]):
        newBalance = int(records[i][2]) - int(records2[i][1])
        records[i][2] = str(newBalance)

output_records = ""
i = 0
while i <= len(records):
    output_records += '|'.join(records[i])
    if i != len(records):
        output_records += '\n'
    i += 1
    if i == len(records):
        break

outputFile = open(F1, 'w')
outputFile.write(output_records)
outputFile.close
This is what I'm getting for output which is off by one number.
Your program output did not match the expected output.
Your output:
1000|1234|10000
1020|2222|0
3000|3344|0
2020|1234|90000
Expected output:
1000|1234|11000
1020|2222|0
3000|3344|0
2020|1234|90000
Can someone point me in the direction of where I'm going wrong? Thanks.
Assume amount and balance are integer-valued. For floats, change int(...) to float(...) in the code.
Code
# Get Records
with open('file1.txt','r') as f1:
    records = []
    for row in f1:
        row = row.rstrip().split('|')
        # Strip whitespace and convert balance to int
        row = [x.strip() if i != 2 else int(x.strip()) for i, x in enumerate(row)]
        records.append(row)

# Get Transactions
with open('file2.txt', 'r') as f2:
    transactions = []
    for row in f2:
        row = row.rstrip().split('|')
        # Strip whitespace and convert amount to int
        row = [x.strip() if i != 1 else int(x.strip()) for i, x in enumerate(row)]
        transactions.append(row)

# Perform Transactions
for t in transactions:
    for record in records:
        # Check records for matching account & pin
        # Brute-force search -- okay for records and transactions only in the thousands
        if t[2:] == record[:2]:
            # Found account to update (record account & pin matches transaction)
            if t[0] == 'add':
                record[-1] += t[1]  # increment balance
            elif t[0] == 'sub':
                if record[-1] - t[1] >= 0:
                    record[-1] -= t[1]  # decrement balance
            break

# Output updated records
with open('file1.txt', 'w') as f3:
    for row in records:
        row = [str(x) for x in row]
        f3.write(' | '.join(row) + '\n')
Test
Prior to running
File1.txt
1000 | 1234 | 10000
1020 | 2222 | 2500
3000 | 3344 | 3000
2020 | 1234 | 95000
File2.txt
add | 1000 | 1000 | 1234
sub | 1000 | 1020 | 2222
add | 1000 | 3000 | 3344
sub | 1000 | 2020 | 1234
After running
File1.txt
1000 | 1234 | 11000
1020 | 2222 | 1500
3000 | 3344 | 4000
2020 | 1234 | 94000
I think the problem could come from these:
for row in records and records2:
    if records[i][1] == records2[i][3] and records2[i][0] == "add":
        newBalance = int(records[i][2]) + int(records2[i][1])
        records[i][2] = str(newBalance)
    elif records2[i][0] == "sub" and int(records[i][2]) >= int(records2[i][1]):
        newBalance = int(records[i][2]) - int(records2[i][1])
        records[i][2] = str(newBalance)
From what I can see, if records[i][1] != records2[i][3] it still runs the elif and subtracts.
Your code is really messy; I advise you to delete it all and restart from an empty file. The following lines are meaningless:

for row in records and records2:

for i in range(len(records)):
    row = records[i]
for i in range(len(records2)):
    row = records2[i]
If you know how to use dictionaries, they might help a bit.
Here is some pseudo code for a possible kind of solution:
accounts = {}
with open(F1,'r') as f:
    for line in f:
        # strip the trailing newline before splitting
        acc, pin, balance = line.strip().split('|')
        accounts[acc] = {'pin': pin, 'balance': int(balance)}

with open(F2,'r') as f:
    for line in f:
        command, amount, acc, pin = line.strip().split('|')
        amount = int(amount)
        if accounts[acc]['pin'] != pin:
            continue  # wrong pin
        if command == 'add':
            accounts[acc]['balance'] += amount
        elif accounts[acc]['balance'] >= amount:  # only sub if there is enough balance
            accounts[acc]['balance'] -= amount
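The pseudo code above stops after updating the dictionary; since the challenge stores the results back in F1, a possible final step (my addition, not part of the original pseudo code) would be:

with open(F1, 'w') as f:
    for acc, info in accounts.items():
        f.write('%s|%s|%s\n' % (acc, info['pin'], info['balance']))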

How to detect a memory leak in Python code?

I'm new to both machine learning and Python! I want my code to predict the object in a picture, which in my case is mostly a car.
When I start the script it runs smoothly, but after 20 or so pictures it hangs up my system because of a memory leak.
I want this script to run over my whole database, which has far more than 20 pictures.
I have tried pympler's tracker to see which objects are taking the most memory.
This is the code I'm trying to run to predict the objects in the picture:
from imageai.Prediction import ImagePrediction
import os
import urllib.request
import mysql.connector
from pympler.tracker import SummaryTracker

tracker = SummaryTracker()

mydb = mysql.connector.connect(
    host="localhost",
    user="phpmyadmin",
    passwd="anshu",
    database="python_test"
)

counter = 0
mycursor = mydb.cursor()
sql = "SELECT id, image_url FROM `used_cars` " \
      "WHERE is_processed = '0' AND image_url IS NOT NULL LIMIT 1"
mycursor.execute(sql)
result = mycursor.fetchall()

def dl_img(url, filepath, filename):
    fullpath = filepath + filename
    urllib.request.urlretrieve(url, fullpath)

for eachfile in result:
    id = eachfile[0]
    print(id)
    filename = "image.jpg"
    url = eachfile[1]
    filepath = "/home/priyanshu/PycharmProjects/untitled/images/"
    print(filename)
    print(url)
    print(filepath)
    dl_img(url, filepath, filename)

    execution_path = "/home/priyanshu/PycharmProjects/untitled/images/"
    prediction = ImagePrediction()
    prediction.setModelTypeAsResNet()
    prediction.setModelPath(os.path.join(execution_path, "/home/priyanshu/Downloads/resnet50_weights_tf_dim_ordering_tf_kernels.h5"))
    prediction.loadModel()
    predictions, probabilities = prediction.predictImage(os.path.join(execution_path, "image.jpg"), result_count=1)
    for eachPrediction, eachProbability in zip(predictions, probabilities):
        per = 0.00
        label = ""
        print(eachPrediction, " : ", eachProbability)
        label = eachPrediction
        per = eachProbability
    print("Label: " + label)
    print("Per:" + str(per))
    counter = counter + 1
    print("Picture Number: " + str(counter))
    sql1 = "UPDATE used_cars SET is_processed = '1' WHERE id = '%s'" % id
    sql2 = "INSERT into label (used_car_image_id, object_label, percentage) " \
           "VALUE ('%s', '%s', '%s') " % (id, label, per)
    print("done")
    mycursor.execute(sql1)
    mycursor.execute(sql2)
    mydb.commit()
    tracker.print_diff()
This is the result I'm getting from a single picture, and it consumes the whole RAM after some iterations. What should I change to stop the leak?
seat_belt : 12.617655098438263
Label: seat_belt
Per:12.617655098438263
Picture Number: 1
done
types | objects | total size
<class 'tuple | 130920 | 11.98 MB
<class 'dict | 24002 | 6.82 MB
<class 'list | 56597 | 5.75 MB
<class 'int | 175920 | 4.70 MB
<class 'str | 26047 | 1.92 MB
<class 'set | 740 | 464.38 KB
<class 'tensorflow.python.framework.ops.Tensor | 6515 | 356.29 KB
<class 'tensorflow.python.framework.ops.Operation._InputList | 6097 | 333.43 KB
<class 'tensorflow.python.framework.ops.Operation | 6097 | 333.43 KB
<class 'SwigPyObject | 6098 | 285.84 KB
<class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output | 4656 | 254.62 KB
<class 'tensorflow.python.framework.traceable_stack.TraceableObject | 3309 | 180.96 KB
<class 'tensorflow.python.framework.tensor_shape.Dimension | 1767 | 96.63 KB
<class 'tensorflow.python.framework.tensor_shape.TensorShapeV1 | 1298 | 70.98 KB
<class 'weakref | 807 | 63.05 KB
Here the model is loaded inside the for loop, once for every image, so it is set up again and again and keeps claiming memory. Move the model setup outside the loop: it then loads only once and each iteration reuses it.
The code should work this way:
execution_path = "/home/priyanshu/PycharmProjects/untitled/images/"
prediction = ImagePrediction()
prediction.setModelTypeAsResNet()
prediction.setModelPath(os.path.join(execution_path, "/home/priyanshu/Downloads/resnet50_weights_tf_dim_ordering_tf_kernels.h5"))
prediction.loadModel()

for eachfile in result:
    id = eachfile[0]
    print(id)
    filename = "image.jpg"
    url = eachfile[1]
    filepath = "/home/priyanshu/PycharmProjects/untitled/images/"
    print(filename)
    print(url)
    print(filepath)
    dl_img(url, filepath, filename)
    predictions, probabilities = prediction.predictImage(os.path.join(execution_path, "image.jpg"), result_count=1)
    for eachPrediction, eachProbability in zip(predictions, probabilities):
        per = 0.00
        label = ""
        print(eachPrediction, " : ", eachProbability)
        label = eachPrediction
        per = eachProbability
    print("Label: " + label)
    print("Per:" + str(per))
    counter = counter + 1
    print("Picture Number: " + str(counter))
    sql1 = "UPDATE used_cars SET is_processed = '1' WHERE id = '%s'" % id
    sql2 = "INSERT into label (used_car_image_id, object_label, percentage) " \
           "VALUE ('%s', '%s', '%s') " % (id, label, per)
    print("done")
    mycursor.execute(sql1)
    mycursor.execute(sql2)
    mydb.commit()
    tracker.print_diff()
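To check that the leak is actually gone, you can keep the pympler tracker you already use and diff per iteration: once the model is loaded outside the loop, the growth reported by print_diff() should stay roughly flat per image. A rough sketch (process_one_image is a hypothetical helper wrapping your loop body, not an existing function):

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
for eachfile in result:
    process_one_image(eachfile)  # hypothetical helper: download + predict + DB update
    tracker.print_diff()         # reported growth should stay roughly constant per image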

Append value only when not found

If a taxonomy in taxonomies is not in translations, I want it to print 152W00000X | Not Found. Currently all of the lines print Not Found, and if I remove the else I get an out-of-range error.
taxonomies = ['152W00000X', '156FX1800X', '200000000X', '261QD0000X', '3336C0003X', '333600000X', '261QD0000X']
translations = {'261QD0000X': 'Clinic/Center Dental', '3336C0003X': 'Pharmacy Community/Retail Pharmacy', '333600000X': 'Pharmacy'}
a = 0
final = []
for nums in taxonomies:
    for i, v in translations.items():
        if nums == i:
            data = v
            final.append(data)
        else:
            final.append('Not Found')

for nums in taxonomies:
    print nums, "|", final[a]
    a = a + 1
Current output is:
152W00000X | Not Found
156FX1800X | Not Found
200000000X | Not Found
261QD0000X | Not Found
3336C0003X | Not Found
333600000X | Not Found
261QD0000X | Not Found
The ideal output is:
152W00000X | Not Found
156FX1800X | Not Found
200000000X | Not Found
261QD0000X | Clinic/Center Dental
3336C0003X | Pharmacy Community/Retail Pharmacy
333600000X | Pharmacy
261QD0000X | Clinic/Center Dental
Your inner loop appends something for every key in translations, so each taxonomy ends up with several entries in final and the values you want get pushed out of position. Look the value up once with dict.get and a default instead:
taxonomies = ['152W00000X', '156FX1800X', '200000000X', '261QD0000X', '3336C0003X', '333600000X', '261QD0000X']
translations = {'261QD0000X': 'Clinic/Center Dental', '3336C0003X': 'Pharmacy Community/Retail Pharmacy', '333600000X': 'Pharmacy'}

a = 0
final = []
for nums in taxonomies:
    final.append(translations.get(nums, 'Not Found'))

for nums in taxonomies:
    print nums, "|", final[a]
    a = a + 1
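Since the lookup now handles missing keys itself, you could also skip the intermediate list entirely and do it in one pass, e.g.:

for nums in taxonomies:
    print nums, "|", translations.get(nums, 'Not Found')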
I am using re to split IDVtaxo.txt at two or more spaces; unless the source is actually delimited by tabs, this will work.
import re

with open('IDVtaxo.txt') as f:
    idvtaxo = {re.split(r'\s{2,}', x)[0]: re.split(r'\s{2,}', x)[2] for x in f.read().splitlines()}

with open('taxonomies.txt') as f:
    taxonomies = f.read().splitlines()

for taxonomy in taxonomies:
    data = taxonomy.split('|')
    translated = idvtaxo.get(data[1], 'Not Found')
    print '%s|%s' % (taxonomy, translated)

I have an SAP-generated file with many columns and some unwanted rows. How should I read it directly into pandas?

My table:
Table To Be Searched MSEG
Number of hits 273208
Maximum No. of Entri 0
Runtime 00:24:17
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Mat. Doc. |MatYr|MvT|Material |Plnt|SLoc|Batch |Customer| Amount in LC| Amount| Quantity|BUn| Qty in UnE|EUn|PO |MatYr|Mat. Doc. |Order |Profit Ctr|SLED/BBD |Pstng Date|Entry Date|Time |User name |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|4912693062|2015 |551|100062 |HDC2|0001|5G30MC1A11| | 9.03 | 9.06 | 0.083 |CS | 2 |EA | | | | |IN1165B085|26.01.2016|01.08.2015|01.08.2015|01:13:16|O33462 |
|4912693063|2015 |501|166 |HDC2|0004| | | 0.00 | 0.00 | 2 |EA | 2 |EA | | | | |IN1165B085| |01.08.2015|01.08.2015|01:13:17|O33462 |
|4912693320|2015 |551|101343 |HDC2|0001|5G28MC1A11| | 53.73 | 53.72 | 0.500 |CS | 12 |EA | | | | |IN1165B085|25.01.2016|01.08.2015|01.08.2015|01:16:30|O33462 |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Table To Be Searched MSEG
Number of hits 273208
Maximum No. of Entri 0
Runtime 00:24:17
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|Mat. Doc. |MatYr|MvT|Material |Plnt|SLoc|Batch |Customer| Amount in LC| Amount| Quantity|BUn| Qty in UnE|EUn|PO |MatYr|Mat. Doc. |Order |Profit Ctr|SLED/BBD |Pstng Date|Entry Date|Time |User name |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|4912696602|2015 |344|100399 |HMH3|0001|5G29MH3S11| | 0.00 | 0.00 | 9,061 |CS | 9,061 |CS | | | | |IN1165B074|26.01.2016|01.08.2015|01.08.2015|01:54:15|A70475 |
It contains more than 1 million rows. Previously I was converting this file to CSV first, using the following Python code:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

with open("Input.txt") as infile:
    for line in infile:
        with open("Out.csv", "a") as myfile:
            if line[:1] == '|':
                if hasNumbers(line) == True:
                    newline = line[1:-2].replace(' ','')
                    newline.translate(None, ",!.;")
                    myfile.write(newline+'\n')
Is there a better way to convert this kind of file to CSV format or how should I directly parse it to pandas?
You could use itertools.ifilter to filter only the table data and a csv.reader to parse the important rows as follows:
import csv
import itertools
import StringIO

with open('input.txt', 'rb') as f_input:
    for line in itertools.ifilter(lambda x: len(x) > 2 and x[0] == '|' and x[1].isalpha(), f_input):
        header = [cols.strip() for cols in next(csv.reader(StringIO.StringIO(line), delimiter='|', skipinitialspace=True))][1:-1]
        break

with open('input.txt', 'rb') as f_input, open('output.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(header)
    for line in itertools.ifilter(lambda x: len(x) > 2 and x[0] == '|' and x[1] != '-' and not x[1].isalpha(), f_input):
        csv_input = csv.reader(StringIO.StringIO(line), delimiter='|', skipinitialspace=True)
        csv_output.writerow([col.strip().translate(None, ",!.;") for col in next(csv_input)[1:-1]])
This would give you an output csv file as follows:
Mat. Doc.,MatYr,MvT,Material,Plnt,SLoc,Batch,Customer,Amount in LC,Amount,Quantity,BUn,Qty in UnE,EUn,PO,MatYr,Mat. Doc.,Order,Profit Ctr,SLED/BBD,Pstng Date,Entry Date,Time,User name
4912693062,2015,551,100062,HDC2,0001,5G30MC1A11,,903,906,0083,CS,2,EA,,,,,IN1165B085,26012016,01082015,01082015,01:13:16,O33462
4912693063,2015,501,166,HDC2,0004,,,000,000,2,EA,2,EA,,,,,IN1165B085,,01082015,01082015,01:13:17,O33462
4912693320,2015,551,101343,HDC2,0001,5G28MC1A11,,5373,5372,0500,CS,12,EA,,,,,IN1165B085,25012016,01082015,01082015,01:16:30,O33462
4912696602,2015,344,100399,HMH3,0001,5G29MH3S11,,000,000,9061,CS,9061,CS,,,,,IN1165B074,26012016,01082015,01082015,01:54:15,A70475
Here is an approach based on the answer in How to convert a SAP .txt extraction into a .csv file, which does not require opening the input file twice.
Here's the code from that answer, with @Martin Evans's clever technique of testing row[1].isalpha() to determine whether you have a header, and only exporting it once:
import csv

ii = 0
with open('file.txt', 'r', encoding='utf8', newline='') as f_input, \
     open(str(ii + 1) + 'output.csv', 'w', encoding='utf8', newline='') as f_output:
    input_lines = filter(lambda x: len(x) > 2 and x[0] == '|' and x[1] == ' ', f_input)
    csv_input = csv.reader(input_lines, delimiter='|')
    csv_output = csv.writer(f_output)
    found_header = False
    for row in csv_input:
        if not found_header and row[1].isalpha():
            found_header = True
            csv_output.writerow(col.strip() for col in row[1:-1])
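The question also asks about going straight to pandas. A minimal sketch, assuming one of the conversions above has already produced output.csv with a single header row:

import pandas as pd

df = pd.read_csv('output.csv')
print(df.head())

Alternatively, as a sketch under the assumption that header and data rows always start with '|' and the in-table separator rows with '|-', you could filter the lines in memory and hand them to pandas without an intermediate file:

import io
import pandas as pd

with open('Input.txt') as f:
    # keep only header and data rows, drop borders and separator rows
    lines = [line.strip().strip('|') for line in f
             if line.startswith('|') and not line.startswith('|-')]
# first kept line is the header; drop its repeats from later page blocks
header, data = lines[0], [row for row in lines[1:] if row != lines[0]]
df = pd.read_csv(io.StringIO('\n'.join([header] + data)),
                 sep='|', skipinitialspace=True)
df.columns = df.columns.str.strip()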
