Parsing Messy Data - python

I'm relatively new to python and was wondering if I could get some assistance in parsing data so that it is easier to analyze.
My data is in the following form (each is an entire line):
20160930-07:06:54.481737|I|MTP_4|CL:BF K7-M7-N7 Restrict for maxAggressive: -4.237195
20160930-07:06:54.481738|I|MTP_4|CL:BF K7-M7-N7 BidPrice: -5.0 mktBestBid: -5.0 bidTheo: -4.096774 bidSeedEdge: 0.195028 bidUnseedEdge: CL:BF K7-M7-N7 = 0.14042 Min Bid: -6.0 Max Ticks Offset: 1 Max Aggressive Ticks: 1
This is my code so far
import os
from itertools import islice

# Destination for the extracted slice of the trade log.
output_filename = os.path.normpath("Mypath/testList.log")

# Opening in "w" mode already truncates any previous contents, so a separate
# "overwrite" pass followed by an append-mode reopen is not needed.
with open(output_filename, "w") as out_file, \
        open("mypath/tradedata.log", "r") as in_file:
    # islice streams the lines at zero-based positions 177004 up to (but not
    # including) 8349710 without loading the whole log into memory.
    out_file.writelines(islice(in_file, 177004, 8349710))
Would it be easiest to just go through and do it by keywords such as bidUnseedEdge, mktBestBid, etc.?

infilename = "path/data.log"
outfilename = "path/OutputData.csv"

# Each entry is (theo keyword / row label, keyword that follows the name).
# The text in front of the second keyword is used as the instrument name.
THEO_FIELDS = (
    ("bidTheo", "BidPrice:"),
    ("offerTheo", "AskPrice:"),
)

with open(infilename, "r") as infile, open(outfilename, "w") as outfile:
    for line_number, line in enumerate(infile, 1):
        # Progress indicator for very large logs.
        if line_number % 1000000 == 0:
            print(line_number)

        # Sample lines look like "<timestamp>|I|MTP_4|<message>"; anything
        # with fewer than four "|"-separated fields is skipped.
        data = line.split("|")
        if len(data) < 4:
            continue
        timestamp, message = data[0], data[3]

        for label, name_end in THEO_FIELDS:
            theo_split = message.split(label + ":")
            # Only lines with exactly one occurrence of the keyword are used.
            if len(theo_split) != 2:
                continue
            # The value is the first whitespace-delimited token after it.
            value = float(theo_split[1].split()[0])
            name = message.split(name_end)[0].strip().split(",")[0]
            outfile.write(
                ",".join((label, timestamp, name, str(value))) + "\n"
            )

print("Done")

Related

How to split file with certain conditions for each end line of each file

I have a .txt file like this:
2019-03-29 12:03:07 line1
line2
line3
....
2019-03-30 07:05:09 line1
line2
....
2019-03-31 10:03:20 line1
line2
....
I split the file into several files, like this:
# NOTE(review): indentation was stripped from this paste; the nesting has to be
# restored before the code can run.
# Path of the file to split and the number of chunk files to produce.
inputData = 'dirname\..'
numThrd = 3
# Split inputData by line count into files named rawData1.txt, rawData2.txt, ...
def chunkFiles():
# First pass over the file: count its lines so a chunk size can be derived.
nline = sum(1 for line in open(inputData,'r', encoding='utf-8', errors='ignore'))
chunk_size = math.floor(nline/int(numThrd))
n_thread = int(numThrd)
# j counts the chunk files opened so far.
j = 0
# Despite its name, "fileout" is the input file being read.
with open(inputData,'r', encoding='utf-8', errors='ignore') as fileout:
for i, line in enumerate(fileout):
# Close the current chunk file when a chunk boundary is reached.
# NOTE(review): enumerate() starts at 0, so i == nline looks unreachable if
# the file is unchanged between the two passes -- confirm.
if (i + 1 == j * chunk_size and j != n_thread) or i == nline:
out.close()
# Open the next chunk file on the first line and at each chunk boundary.
if i + 1 == 1 or (j != n_thread and i + 1 == j * chunk_size):
chunkFile = 'rawData' + str(j+1) + '.txt'
# Stop the loop if the target chunk file already exists.
if os.path.isfile(chunkFile ):
break
out = open(chunkFile , 'w+', encoding='utf-8', errors='ignore')
j = j + 1
# NOTE(review): fLine is never used, and matchLine is not defined anywhere in
# this snippet.
fLine = line[:-1]
if not matchLine:
if out.closed != True:
out.write(line)
# Progress report every 1000 lines.
if i % 1000 == 0 and i != 0:
print ('Processing line %i ...' % (i))
However, I want each split file to end on the line immediately before a line that starts with a date.
recent output that I got:
rawData1.txt
2019-03-29 12:03:07 line1
line2
....
-------------------------
rawData2.txt
line50
line51
2019-03-30 07:05:09 line1
line2
.....
Desired output:
rawData1.txt
2019-03-29 12:03:07 line1
line2
line3
....
-------------------------
rawData2.txt
2019-03-30 07:05:09 line1
line2
....
What should I add to the script above to meet that condition?
Thank you very much
You can produce the desired output by using a list to hold the lines you want to write (see below).
def write_chunk(filename, chunk):
    """Write every string in ``chunk`` to ``filename``, replacing any old file.

    Args:
        filename: Path of the file to create or overwrite.
        chunk: Iterable of strings. Each one is written as-is, so the strings
            must carry their own line endings.
    """
    with open(filename, "w") as out:
        out.writelines(chunk)
import re

# A new record starts on a line that begins with a date such as "2019-03-29".
# Testing for the date itself (rather than for leading whitespace on the
# continuation lines) also works when continuation lines are not indented.
RECORD_START = re.compile(r"\d{4}-\d{2}-\d{2}\b")

chunk = []
n_chunk = 1
with open("data.txt") as f:
    for line in f:
        # Flush the lines gathered so far when the next dated record begins.
        if chunk and RECORD_START.match(line):
            write_chunk("{}.txt".format(n_chunk), chunk)
            chunk = []
            n_chunk += 1
        chunk.append(line)
# Write the final chunk; an empty input produces no output file.
if chunk:
    write_chunk("{}.txt".format(n_chunk), chunk)

python: write data in .csv file disorder

#write data in .csv file
# NOTE(review): indentation was stripped from this paste; it has to be restored
# before the function can run.
# NOTE(review): the parameter names `type` and `id` shadow Python built-ins.
def data_save_csv(type,data,id,name,header,since = None):
#get the date when storage data
date_storage()
#create the data storage directory
csv_parent_directory = os.path.join("dataset","csv",type,glovar.date)
#write data in .csv
# Pick the file-name prefix from the data type.
if type == "group_members":
csv_file_prefix = "gm"
elif type == "group_feed":
csv_file_prefix = "gf"
elif type == "public_figure_posts":
csv_file_prefix = "pfp"
elif "user_" in type:
# create the data storage directory
csv_parent_directory = os.path.join("dataset", "csv", "user", type, glovar.date)
if type == "user_friends":
csv_file_prefix = "uf"
elif type == "user_likes":
csv_file_prefix = "ul"
elif type == "user_feed":
# NOTE(review): "user_feed" gets the same "uf" prefix as "user_friends".
csv_file_prefix = "uf"
# create (mkdir) the csv_parent_directory
directory_create(csv_parent_directory)
# Build the file name from the prefix, the optional `since` stamp, the storage
# time, id and name.
# NOTE(review): the `since` branch has no "_" between time_storage() and id,
# unlike the else branch -- confirm whether that is intended.
if since:
csv_file_name = csv_file_prefix + "_" + since.strftime("%Y%m%d-%H%M%S") + "_" + time_storage() + id + "_" +name + ".csv"
else:
csv_file_name = csv_file_prefix + "_" + time_storage() + "_" + id + "_" +name + ".csv"
csv_file_directory = os.path.join(csv_parent_directory,csv_file_name)
# Debug output: print the fields of every feed item before writing.
if type == "user_feed":
feed = data
for item in feed:
# parse the feed data from group_download.py
print("id=" + item['id'] + ",")
print("permalink_url=" + item['permalink_url'] + ",")
print("created_time=" + item['created_time'] + ",")
print("updated_time=" + item['updated_time'] + ",")
print("name=" + item['from']['name'] + ",")
print("from_id=" + item['from']['id'] + ",")
print("message=" + item['message'] + ",")
print("link=" + item['link'] + ",")
print("likes_total_count=" + str(item['likes']['summary']['total_count']) + ",")
print("comments_total_count=" + str(item['comments']['summary']['total_count']) + ",")
with open(csv_file_directory,'w',newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile,delimiter=',',quotechar='"',quoting=csv.QUOTE_MINIMAL)
#csv header
writer.writerow(header)
#if data is group members(group_manage.py)
# NOTE: `type == "a" or "b" or "c"` is always truthy, because the non-empty
# string literals are truthy on their own. This branch therefore runs for every
# type and the elif below is never reached.
if type == "group_members" or "user_friends" or "user_likes":
row = []
# One CSV row per record, with values in the order keys() yields them (not in
# header order); booleans are converted to strings first.
for i in range(len(data)):
for k in data[i].keys():
if isinstance(data[i][k],bool):
data[i][k] = str(data[i][k])
row.append(data[i][k])
writer.writerow(row)
row = []
#if data is group feed(group_download.py)
# NOTE: this condition has the same always-truthy `or` pattern as the if above.
elif type == "group_feed" or "public_figure_posts" or "user_feed":
feed = data
for item in feed:
#parse the feed data from group_download.py
row = [item['id'],item['permalink_url'],item['created_time'],item['updated_time'],item['from']['name'],item['from']['id'],item['message'],item['link'],item['likes']['summary']['total_count'],item['comments']['summary']['total_count']]
writer.writerow(row)
# NOTE(review): the with statement already closes csvfile, so this call looks
# redundant.
csvfile.close()
I wrote a Python program to write data to a .csv file. When the type is "user_feed",
I print the items of the data:
id=110286969468305_112459422584393,
permalink_url=https://www.facebook.com/110286969468305/posts/112459422584393,
created_time=2016-12-18T12:44:52+0000,
updated_time=2016-12-18T12:47:10+0000,
name=Dewi Nurfitri Oktaviani,
from_id=10202749157833181,
message=Hi, nice to meet you,
link=,
likes_total_count=0,
comments_total_count=1,
The printed values are correct, but when the data is written to the .csv file, the order of the data does not match the order of the header. The header is:
header = ["POST ID", "Permalink", "Create time", "Updated time", "Author", "Author ID", "Message", "Link", "Likes", "Comments"]
and you can see that in this method "data_save_csv",
elif type == "group_feed" or "public_figure_posts" or "user_feed":
feed = data
for item in feed:
#parse the feed data from group_download.py
row = [item['id'],item['permalink_url'],item['created_time'],item['updated_time'],item['from']['name'],item['from']['id'],item['message'],item['link'],item['likes']['summary']['total_count'],item['comments']['summary']['total_count']]
writer.writerow(row)
You can see that the order of the data items is the same as in the header. However, when I open the csv file, the header items are in the right order but the data items are not: there is no "id" value, and the other items are out of order.
could you please help me?
Problem 1: This line
if type == "group_members" or "user_friends" or "user_likes":
isn't doing what you want. The expression always evaluates to True. Possible replacements:
if type == "group_members" or type == "user_friends" or type == "user_likes":
if type in ("group_members", "user_friends", "user_likes", ):
if type in {"group_members", "user_friends", "user_likes", }:
and this line
elif type == "group_feed" or "public_figure_posts" or "user_feed":
has the same problem. You should fix both lines and try again.

Python Count paragraph

Hello all, I've been tasked with counting lines and paragraphs. Counting every line is obviously easy, but I'm stuck on counting the paragraphs. A blank line (a paragraph with no characters) should be given the number zero, and each new paragraph should get a number one higher than the previous one. For example an input file is: Input and an Output should come out Output
so my code is:
# Copy infile to outfile, prefixing each line with a paragraph number and a
# line number.
# NOTE(review): indentation was stripped from this paste; it has to be restored
# before the function can run.
def insert_line_para_nums(infile, outfile):
f = open(infile, 'r')
out = open(outfile, 'w')
linecount = 0
for i in f:
# NOTE(review): paragraphcount appears to be reset to 0 for every line read, so
# it can only be 0 or 1 when written -- it never accumulates across paragraphs.
paragraphcount = 0
if '\n' in i:
linecount += 1
# A line holding only the newline is treated as blank.
if len(i) < 2: paragraphcount *= 0
elif len(i) > 2: paragraphcount = paragraphcount + 1
# Output format: paragraph number, line number, then the original line.
out.write('%-4d %4d %s' % (paragraphcount, linecount, i))
f.close()
out.close()
def insert_line_para_nums(infile, outfile):
    """Copy ``infile`` to ``outfile``, numbering paragraphs and lines.

    Every output line is formatted as ``'%-4d %4d %s'``: the paragraph
    number, the 1-based line number, then the original line. Blank lines get
    paragraph number 0; the lines of the k-th run of consecutive non-blank
    lines get k.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the file to create or overwrite.
    """
    linecount = 0
    paragraphcount = 0
    in_paragraph = False
    # The with statement closes both files even if reading or writing fails.
    with open(infile, 'r') as src, open(outfile, 'w') as dst:
        for line in src:
            # Count every line, including a final one without a newline.
            linecount += 1
            if line.rstrip('\n'):
                # Any content at all (even one character) belongs to a
                # paragraph; the first such line after a blank opens a new one.
                if not in_paragraph:
                    paragraphcount += 1
                    in_paragraph = True
                paragraphnumber = paragraphcount
            else:
                in_paragraph = False
                paragraphnumber = 0
            dst.write('%-4d %4d %s' % (paragraphnumber, linecount, line))
This is one way to do it, and not the prettiest.
import re
# Count the paragraphs in a.txt, where a paragraph is a run of consecutive
# non-blank lines.
with open('a.txt', 'r') as f:
    lines = f.readlines()

paragraph = 0
# Starting as "blank" lets the very first non-blank line open a paragraph.
previous_blank = True
for line in lines:
    blank = not line.strip()
    # A paragraph begins wherever a non-blank line follows a blank one.
    if previous_blank and not blank:
        paragraph += 1
    previous_blank = blank
print(paragraph)

Python script need help adding an export .txt into it

# Reads f.txt five times; each pass prints modified variants of its
# "first:second" lines. Nothing is written to a file -- all output goes to
# stdout through print().
# NOTE(review): indentation was stripped from this paste; it has to be restored
# before the script can run.
f = open('f.txt', 'r')
import re
# Pass 1: if the first field ends in digits and the second field has no digits,
# print the line with those trailing digits appended.
for line in f:
u = line.split(':')[0]
p = line.split(':')[1]
m = re.search(r'\d+$', u)
if m is not None:
m1 = re.search(r'\d+',p)
if m1 is None:
print(u + ':' + p.rstrip() + m.group())
f.close()
# Pass 2: if the second field has no digits, print the line with '123' appended.
f = open('f.txt', 'r')
for line in f:
l = line.rstrip().split(':')[1]
m = re.search(r'\d+', l)
if m is None:
print(line.rstrip() + '123')
f.close()
# Pass 3: same test as pass 2, but '1' is appended.
f = open('f.txt', 'r')
for line in f:
l = line.rstrip().split(':')[1]
m = re.search(r'\d+', l)
if m is None:
print(line.rstrip() + '1')
f.close()
# Pass 4: if the first field ends in a four-digit run between 1951 and 2012 and
# the second field does not end in digits, print the line with the last two of
# those digits appended.
f = open('f.txt', 'r')
for line in f:
u = line.split(':')[0]
p = line.split(':')[1]
m = re.search(r'\d+$', u)
if m is not None and len(m.group()) == 4:
if int(m.group()) < 2013 and int(m.group()) > 1950:
m1 = re.search(r'\d+$',p)
if m1 is None:
print(u + ':' + p.rstrip() + m.group()[2:])
f.close()
# Pass 5: if the second field contains digits, print the line with all digits
# removed from it, unless that leaves nothing after the ':'.
f = open('f.txt', 'r')
for line in f:
s = line.split(':')[1]
m = re.search(r'\d+', s)
if m is not None:
newone = line.split(':')[0] + ':' + re.sub(r'\d+',"", s).rstrip()
if newone[-1:] != ':':
print(line.split(':')[0] + ':' + re.sub(r'\d+',"", s).rstrip())
f.close()
This is the .py script I made. It works fine, but it doesn't export a .txt once it has finished editing all of the lines in f.txt - it just closes.
Could I get some help adding code to make it export the result as finished.txt? Thanks in advance.
You are not opening a file in writing mode in any part of the code.
You do it like this:
e = open('filename.txt', 'w')
Then, write lines to it:
e.write('example string')

Python Write To File Missing Lines

I'm having trouble using python to write strings into a file:
(what I'm trying to do is using python to generate some C programs)
The code I have is the following:
# Builds C statements of the form "square_sum = square_sum + ...;" as strings,
# prints them and writes them to test.txt.
# NOTE(review): indentation was stripped from this paste; the loop nesting has
# to be restored before the script can run.
filename = "test.txt"
i = 0
string = "image"
tempstr = ""
average1 = "average"
average2 = "average*average"
output = ""
FILE = open(filename,"w")
# Outer loop over i, inner loop over j, each running 0..19.
while i < 20:
j = 0
output = "square_sum = square_sum + "
while j < 20:
# Pixel name such as image_3_7.
tempstr = string + "_" + str(i) + "_" + str(j)
# Append pixel*pixel + average*average - 2*average*pixel for this pixel.
output = output + tempstr + "*" + tempstr + " + " + average2 + " - 2*" + average1 + "*" + tempstr
# Terms are joined with " + "; the last term (j == 19) ends with ";".
if j != 19:
output = output + " + "
if j == 19:
output = output + ";"
j = j + 1
output = output + "\n"
i = i + 1
print(output)
# writelines() is handed a single string here.
FILE.writelines(output)
# NOTE: without parentheses this only references the close method and never
# calls it, so the file is not explicitly closed.
FILE.close
The print gives me the correct output, but the file is missing its last line and part of the second-to-last line. What's the problem with writing strings to the file?
Thank you!
Probably help if you called the method...
FILE.close()
The problem is that you aren't calling the close() method, just mentioning it in the last line. You need parens to invoke a function.
Python's with statement can make that unnecessary though:
with open(filename,"w") as the_file:
while i < 20:
j = 0
output = "square_sum = square_sum + "
...
print(output)
the_file.writelines(output)
When the with clause is exited, the_file will be closed automatically.
Try:
with open(filename,"w") as FILE:
while i < 20:
# rest of your code with proper indent...
no close needed...
First, a Pythonified version of your code:
# Template for one pixel name; {i} and {j} are filled in per pixel below.
img = 'image_{i}_{j}'
avg = 'average'
# One term of the sum: img*img + avg*avg - 2*avg*img. This first format() call
# substitutes only img and avg, leaving the {i}/{j} placeholders in place.
clause = '{img}*{img} + {avg}*{avg} - 2*{avg}*{img}'.format(img=img, avg=avg)
# range() and print() behave the same on Python 2 and 3 here, unlike the
# Python 2-only xrange and print statement.
clauses = (clause.format(i=i, j=j) for i in range(20) for j in range(20))
joinstr = '\n + '
output = 'square_sum = {};'.format(joinstr.join(clauses))
fname = 'output.c'
with open(fname, 'w') as outf:
    print(output)
    outf.write(output)
Second, it looks like you are hoping to speed up your C code by fanatical inlining. I very much doubt the speed gains will justify your efforts over something like
/* Accumulate the plain sum and the sum of squares of every image[i][j]
   in a single pass over a maxi x maxj grid. */
maxi = 20;
maxj = 20;
sum = 0;
sqsum = 0;
for(i=0; i<maxi; i++)
for(j=0; j<maxj; j++) {
t = image[i][j];
sum += t;
sqsum += t*t;
}
/* Sum over all pixels of (t - average)^2, expanded as
   sum(t*t) + N*average*average - 2*average*sum(t), with N = maxi*maxj. */
square_sum = sqsum + maxi*maxj*average*average - 2*sum*average;
Looks like your indentation may be incorrect, but just some other comments about your code:
writelines() writes the content of a list or iterator to the file.
Since you're outputting a single string, just use write().
# The list must be assigned with "="; without it, `lines [...]` is a subscript
# of an undefined name.
lines = ["lineone\n", "line two\n"]
with open("myfile.txt", "w") as f:
    # writelines() takes an iterable of strings and adds no line separators.
    f.writelines(lines)
Or just:
# A single string is written with write(); the with block closes the file.
output = "big long string\nOf something important\n"
with open("myfile.txt", "w") as f:
    f.write(output)
As another side note, it may be helpful to use the += operator.
output += "more text"
# is equivalent to
output = output + "more text"

Categories

Resources