Long time listener, first time caller! I have a Python script for parsing a Google Base feed text file. It pulls out particular pieces of data and creates a formatted file I can upload to Bing Shopping. After finally getting it to run, I've discovered that it just outputs blank files instead of the cleaned-up data I wanted. What am I missing here? I really appreciate any help! Fair warning: I'm a pretty big Python newb, and I've had a lot of help writing this already.
import sys, os
import pandas as pd
import datetime

def remove_quotes(data):
    lines = data.split('\n')
    for i, line in enumerate(lines):
        lines[i] = lines[i].replace('"', '')
        print lines[i]
    return data
def tab_error(index, line, output):
    count = len(line.split('\t'))
    if count != 19:
        err = 'Tab issue at line {linenum} : {numtabs} extra tabs'.format(
            linenum=index, numtabs=(count - 19))
        print err
        output.write(err + '\n')
        return True
    return False
def html_error(index, line, output):
    htmltags = ['&fract12', '&39', '&', '&qt;', '<', '&rt;', '"', '>', 'quot', '’']
    for tag in htmltags:
        if line.find(tag) > 0:
            err = 'HTML issue at line {linenum}'.format(linenum=index)
            print err
            output.write(err + '\n')
            return True
    return False
def read_data(filename):
    with open(filename, 'r') as infile:
        data = infile.read()
    return data
def tabs_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        header = ''
        for x in xrange(19):
            header += 'x' + str(x + 1) + '\t'
        cleanfile.write(header)
        # for each line in the file
        for i, line in enumerate(data.split('\r')[1:]):
            # check line for tabs error
            data_error = tab_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error:
                cleanfile.write('\n' + newline)
def html_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        # for each line in the file
        lines = data.split('\n')
        cleanfile.write(lines[0])
        for i, line in enumerate(lines[1:]):
            # check line for HTML errors
            data_error = html_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error and newline:
                cleanfile.write('\n' + newline)
if __name__ == '__main__':
    # Clean tabs
    filename = sys.argv[1]
    ts = datetime.datetime.now().isoformat()
    print ts
    with open('bing_errors.txt', 'w') as output:
        # print 'Removing quotes within .. product description and ...'
        # data = remove_quotes(data)
        print 'Removing lines with more than 19 tabs...'
        data = read_data(filename)
        tabs_check(data, output, 'clean19.txt')
        # Delete and reorder columns
        print 'Deleting and reordering columns...'
        df = pd.read_table('clean19.txt')
        tmp = df[['x8', 'x2', 'x3', 'x4', 'x6', 'x1', 'x5']]
        tmp.columns = ['MPID',
                       'Brand (BrandorManufacturer)',
                       'Title',
                       'Item Description',
                       'Price',
                       'ProductURL',
                       'ImageURL']
        tmp.to_csv('tmp.txt', index=False, sep='\t')
        os.remove('clean19.txt')
        # HTML errors
        print 'Checking for HTML errors...'
        data = read_data('tmp.txt')
        html_check(data, output, 'BT1.txt')
        os.remove('tmp.txt')
        # row = tmp[tmp['MPID'] == 8724]
        # print row
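A note for anyone hitting the same blank-output symptom: two likely culprits stand out, assuming the feed file uses ordinary \n or \r\n line endings. tabs_check splits on '\r', so an LF-only file comes through as one giant "line" that fails the 19-column check, and nothing but the header ever gets written. Separately, remove_quotes (currently commented out) edits its lines list but returns the original data unchanged, and html_error's line.find(tag) > 0 misses a tag at position 0 (>= 0, or the in operator, is safer). A minimal sketch of the first two fixes under those assumptions:

def remove_quotes(data):
    lines = data.splitlines()
    for i, line in enumerate(lines):
        lines[i] = line.replace('"', '')
    return '\n'.join(lines)  # return the edited text, not the original

def tabs_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        header = '\t'.join('x' + str(x + 1) for x in xrange(19))
        cleanfile.write(header)
        # splitlines() handles \n, \r\n and \r, unlike split('\r')
        for i, line in enumerate(data.splitlines()[1:]):
            data_error = tab_error(i, line, output)
            newline = line.replace('"', '').strip()
            if not data_error:
                cleanfile.write('\n' + newline)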
Related
I am using a script in Fusion 360 called importsplinecsv. I was wondering if it is possible to modify the script so that it imports only every 10th row, as the number of rows being imported is very large and bloats the design. If I could get some help, that would be awesome. Here is the script:
# Author-Autodesk Inc.
# Description-Import spline from csv file

import adsk.core, adsk.fusion, traceback
import io

def run(context):
    ui = None
    try:
        app = adsk.core.Application.get()
        ui = app.userInterface
        # Get all components in the active design.
        product = app.activeProduct
        design = adsk.fusion.Design.cast(product)
        title = 'Import Spline csv'
        if not design:
            ui.messageBox('No active Fusion design', title)
            return
        dlg = ui.createFileDialog()
        dlg.title = 'Open CSV File'
        dlg.filter = 'Comma Separated Values (*.csv);;All Files (*.*)'
        if dlg.showOpen() != adsk.core.DialogResults.DialogOK:
            return
        filename = dlg.filename
        with io.open(filename, 'r', encoding='utf-8-sig') as f:
            points = adsk.core.ObjectCollection.create()
            line = f.readline()
            data = []
            while line:
                pntStrArr = line.split(',')
                for pntStr in pntStrArr:
                    try:
                        data.append(float(pntStr))
                    except:
                        break
                if len(data) >= 3:
                    point = adsk.core.Point3D.create(data[0], data[1], data[2])
                    points.add(point)
                line = f.readline()
                data.clear()
        if points.count:
            root = design.rootComponent
            sketch = root.sketches.add(root.xYConstructionPlane)
            sketch.sketchCurves.sketchFittedSplines.add(points)
        else:
            ui.messageBox('No valid points', title)
    except:
        if ui:
            ui.messageBox('Failed:\n{}'.format(traceback.format_exc()))
I have not used this library before, but try:

for i, line in enumerate(f):
    if i % 10 == 0:
        # your import command goes here

f is your file pointer, i will be the line number, and line will be your line.
# inside run(), replacing the original read loop:
dlg = ui.createFileDialog()
dlg.title = 'Open CSV File'
dlg.filter = 'Comma Separated Values (*.csv);;All Files (*.*)'
if dlg.showOpen() != adsk.core.DialogResults.DialogOK:
    return
filename = dlg.filename
with io.open(filename, 'r', encoding='utf-8-sig') as f:
    points = adsk.core.ObjectCollection.create()
    data = []
    for i, line in enumerate(f):
        if i % 10 != 0:
            continue  # keep only every 10th row
        for pntStr in line.split(','):
            try:
                data.append(float(pntStr))
            except:
                break
        if len(data) >= 3:
            point = adsk.core.Point3D.create(data[0], data[1], data[2])
            points.add(point)
        data.clear()
if points.count:
    root = design.rootComponent
    sketch = root.sketches.add(root.xYConstructionPlane)
    sketch.sketchCurves.sketchFittedSplines.add(points)
else:
    ui.messageBox('No valid points', title)

Note: do not keep the inner while line: / f.readline() loop here; mixing it with enumerate(f) would skip lines unpredictably. Also make sure data is initialised before the loop, as above; the surrounding try/except in run() stays as in the original script.
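If you'd rather not track the counter by hand, itertools.islice can take every 10th line directly; a sketch of just the read loop, with the rest of run() unchanged:

import itertools

with io.open(filename, 'r', encoding='utf-8-sig') as f:
    points = adsk.core.ObjectCollection.create()
    for line in itertools.islice(f, 0, None, 10):  # rows 0, 10, 20, ...
        data = []
        for pntStr in line.split(','):
            try:
                data.append(float(pntStr))
            except ValueError:
                break
        if len(data) >= 3:
            points.add(adsk.core.Point3D.create(data[0], data[1], data[2]))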
I would like to extract some info (between strings, e.g. oldtime: ... oldtime!>) from a text file and write it to a CSV file. My input text file is like this:
=======================
oldtime:
hours:1:hours!>
minutes:12:minutes!>
oldtime!>
newtime:
hours:15:hours!>
minutes:17:minutes!>
newtime!>
oldtime:
hours:11:hours!>
minutes:22:minutes!>
oldtime!>
newtime:
hours:5:hours!>
minutes:17:minutes!>
newtime!>
==========================
I started with this, but I cannot get any further:

with open(inputfile, 'r') as f, open('outputfile.csv', 'a') as f1:
    f1.write("oldtime; newtime \n")
    for row in f:
        if "oldtime:" in str(row):
            temp = re.split(r'(#oldtime[\n\r]|[\n\r]#oldtime!>)', str(row))
            # ???
        if "newtime:" in str(row):
            temp = re.split(r'(#newtime[\n\r]|[\n\r]#newtime!>)', str(row))
I would like to get a CSV file like this as output:

oldtime  newtime
01:12    15:17
11:22    05:17

Could you please help me? Thank you.
This is one approach using the re and csv modules.

Ex:

import re
import csv

with open(filename) as infile, open(filename_1, "w") as outfile:
    data = infile.read()
    hrs = re.findall(r"hours:(\d+):hours", data)       # get all hours
    mins = re.findall(r"minutes:(\d+):minutes", data)  # get all minutes
    data = zip(hrs, mins)
    writer = csv.writer(outfile)                       # write CSV
    writer.writerow(["oldtime", "newtime"])            # header
    for m, n in zip(data[0::2], data[1::2]):
        writer.writerow([":".join(m), ":".join(n)])    # write old time & new time
Another solution, close to Rakesh's, assuming your file always has the same structure (oldtime -> hours -> minutes -> newtime -> hours -> minutes ...):

Extract all the numbers from the string with a regex: match = re.findall(r'\d+', str_file)
Convert this list by joining hours and minutes: dates = [i + ":" + j for i, j in zip(match[::2], match[1::2])]
Create a dataframe using the pandas module
Export the data

Here is the code:
# Import modules
import re
import pandas as pd

with open("../temp.txt", 'r') as f:
    # Read file as a string
    str_file = f.read()

# Extract all numbers
match = re.findall(r'\d+', str_file)
print(match)
# ['1', '12', '15', '17', '11', '22', '5', '17']

# Create dates
dates = [i + ":" + j for i, j in zip(match[::2], match[1::2])]
print(dates)
# ['1:12', '15:17', '11:22', '5:17']

# Create dataframe
df = pd.DataFrame({"oldtime": dates[::2],
                   "newtime": dates[1::2]})
print(df)
#   oldtime newtime
# 0    1:12   15:17
# 1   11:22    5:17

# Export the data
df.to_csv("output.csv", index=False)
EDIT 1:

Assuming the oldtime and newtime blocks can be swapped: here I read the file line by line and sort the oldtime and newtime values into a dictionary. There is a lot of slicing, but it works on my test file.

# Import module
import pandas as pd

with open("../temp.txt", 'r') as f:
    # Read the file line by line
    list_split = ["oldtime:", "newtime:"]
    dates = {"oldtime:": [], "newtime:": []}
    line = f.readline().rstrip('\n')
    while True:
        line = line.rstrip('\n')
        print([line])
        if line in list_split:
            key = line
            hours = f.readline().rstrip('\n').split(":")[1]
            minutes = f.readline().rstrip('\n').split(":")[1]
            dates[key].append(hours + ':' + minutes)
        line = f.readline()
        if not line:
            break

print(dates)
# {'oldtime:': ['1:12', '11:22'], 'newtime:': ['15:17', '5:17']}

# Create dataframe
df = pd.DataFrame({"oldtime": dates["oldtime:"],
                   "newtime": dates["newtime:"]})
print(df)
#   oldtime newtime
# 0    1:12   15:17
# 1   11:22    5:17

# Export the data
df.to_csv("output.csv", index=False)
EDIT 2:

import pandas as pd

with open("../temp.txt", 'r') as f:
    list_split = ["oldtime:", "newtime:"]
    dates = {"oldtime": [], "newtime": []}
    line = f.readline().rstrip('\n')
    while True:
        # Ignore blank lines
        if ("oldtime:" in line) or ("newtime:" in line):
            # Process a new "oldtime" or "newtime" block
            # Class: either "oldtime" or "newtime"
            class_time = line.replace(" ", "").rstrip('\n')[:-1]
            # Default hour/minute values
            hours = "24"
            minutes = "60"
            # Read next line
            line = f.readline().rstrip('\n')
            # While the block has not ended
            while class_time + "!>" not in line:
                # If hour in line: update hour
                if 'hour' in line:
                    hours = line.split(":")[1]
                # If minute in line: update minute
                elif 'minute' in line:
                    minutes = line.split(":")[1]
                # Read next line
                line = f.readline().rstrip('\n')
            # End of block: add it to the dictionary
            dates[class_time].append(hours + ':' + minutes)
        # Read next line
        line = f.readline()
        # If end of file: exit
        if not line:
            break

# Create dataframe
df = pd.DataFrame({"oldtime": dates["oldtime"],
                   "newtime": dates["newtime"]})

# Export the data
df.to_csv("output.csv", index=False)

Hope that helps!
Great question :).

Here's a simple solution: split the string on the ":" character, turn the number strings into integers, combine them back into hh:mm pairs, and then write them to a CSV.

Here is the code:

import csv

with open('data.txt', 'r') as f:
    data = f.read()

data = data.split(sep=':')
nums = []
for i in data:
    try:
        nums.append(int(i))
    except ValueError:
        pass

times = []
for i in range(len(nums)):
    if i % 2 == 0:
        times.append(str(nums[i]) + ":" + str(nums[i + 1]))

with open('time_data.csv', 'w+', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['oldtime', 'newtime'])
    for i in range(len(times)):
        if i % 2 == 0:
            writer.writerow([times[i], times[i + 1]])
After reading Rakesh's answer, I wrote this:

import re
import csv

list_i = ''
file_name = 'data.txt'
file_name1 = 'data_1.txt'

with open(file_name, 'r') as f, open(file_name1, 'w', newline='') as f1:
    data = f.read()
    list_1 = re.findall(r'hours:\d+:hours', data)
    list_2 = re.findall(r'minutes:\d+:minutes', data)
    for i in list_1:
        list_i += i
    list_2_i = ''
    for i in list_2:
        list_2_i += i
    list_1 = re.findall(r'\d+', list_i)
    list_2 = re.findall(r'\d+', list_2_i)
    data = []
    for i in range(len(list_1)):
        if i % 2 == 0:
            data.append([str(list_1[i]) + ':' + str(list_2[i]),
                         str(list_1[i + 1]) + ':' + str(list_2[i + 1])])
    writer = csv.writer(f1)
    writer.writerow(['oldtime', 'newtime'])
    for i in data:
        writer.writerow(i)
Also, @Rakesh, your code returned the error:

TypeError: 'zip' object is not subscriptable

Is there a way to fix this? :)
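In Python 3, zip() returns a lazy iterator rather than a list, so slicing it with data[0::2] raises that TypeError. Wrapping the zip in list() before slicing should be enough; a minimal fix to those two lines of Rakesh's answer:

data = list(zip(hrs, mins))  # list() makes the zipped pairs sliceable in Python 3
for m, n in zip(data[0::2], data[1::2]):
    writer.writerow([":".join(m), ":".join(n)])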
I am not sure why I am getting the error mentioned. Here is the code, which uses numpy.loadtxt:
import numpy as np

def load(name):
    print("start reading file with target")
    wfile = open(name, "r")
    line = wfile.readline().replace("\n", "")
    print line
    splits = line.split(",")
    print splits
    datalen = len(splits)
    print datalen
    wfile.close()
    X = np.loadtxt(open(name), delimiter=',', usecols=range(0, datalen), skiprows=0)
    print("done")
    return np.array(X)
Here is a sample of the CSV file's header row. (Note: not listing all of it, as there are 501 items in the CSV file.)

Id,asmSize,bytesSize,asmCompressionRate,bytesCompressionRate,ab_ratio,abc_ratio,ab2abc_ratio,sp_,...
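Since the error text isn't quoted, this is only a guess: if it is something like ValueError: could not convert string to float: 'Id', the cause is that the first row of the file is the textual header shown above, and skiprows=0 makes np.loadtxt try to parse it as numbers. A minimal sketch of that one change:

import numpy as np

# skiprows=1 skips the "Id,asmSize,..." header line
X = np.loadtxt(name, delimiter=',', usecols=range(0, datalen), skiprows=1)

(If the Id column itself is non-numeric, it would need to be dropped from usecols as well.)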
I have a script to clean URLs to get base domains, i.e. from example.com/example1 and example.com/example2 down to example.com. My issue is that when it goes through the file of URLs, it ends up with duplicate base domains. I want to remove the duplicates while printing the URLs to a file. Below is the code I currently have.
from Tkinter import *
import tkFileDialog
import re

def main():
    fileOpen = Tk()
    fileOpen.withdraw()  # hiding tkinter window
    file_path = tkFileDialog.askopenfilename(
        title="Open file", filetypes=[("txt file", ".txt")])
    if file_path != "":
        print "you chose file with path:", file_path
    else:
        print "you didn't open anything!"
    fin = open(file_path)
    fout = open("URL Cleaned.txt", "wt")
    for line in fin.readlines():
        editor = (line.replace('[.]', '.')
                  .replace('[dot]', '.')
                  .replace('hxxp://www.', '')
                  .replace('hxxps://www.', '')
                  .replace('hxxps://', '')
                  .replace('hxxp://', '')
                  .replace('www.', '')
                  .replace('http://www.', '')
                  .replace('https://www.', '')
                  .replace('https://', '')
                  .replace('http://', ''))
        editor = re.sub(r'/.*', '', editor)

if __name__ == '__main__':
    main()
Any help is appreciated. I have scoured the posts and tried all of the suggestions for my issue and have not found one that works.
You can use a regular expression to find the base domains.

If you have one URL per line in your file:

import re

def main():
    file = open("url.txt", 'r')
    domains = set()
    # works for any URL like https://www.domain.com/something/somethingmore..., also without www, without https, or just www.domain.org
    matcher = re.compile("(h..ps?://)?(?P<domain>(www\.)?[^/]*)/?.*")
    for line in file:
        # make here any replace you need for obfuscated urls, like: line = line.replace('[.]', '.')
        if line[-1] == '\n':  # remove "\n" from end of line if present
            line = line[0:-1]
        match = matcher.search(line)
        if match != None:  # if a url has been found
            domains.add(match.group('domain'))
    print domains
    file.close()

main()
For example, with this file, it will print:
set(['platinum-shakers.net', 'wmi.ns01.us', 'adservice.no-ip.org', 'samczeruno.pl', 'java.ns1.name', 'microsoft.dhcp.biz', 'ids.us01.us', 'devsite.quostar.com', 'orlandmart.com'])
Perhaps you could use a regular expression:

import re

p = re.compile(r".*\.com/(.*)")  # to get, for instance, 'example1' or 'example2' etc.

with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
    lines = fin.readlines()
    bases = set(re.search(p, line).groups()[0] for line in lines if len(line) > 1)
    for b in bases:
        fout.write(b + '\n')

Using with open(...) automatically closes the files after executing the block of code.
Output:
Using a text file with:
www.example.com/example1
www.example.com/example2
# blank lines are accounted for
www.example.com/example3
www.example.com/example4
www.example.com/example4 # as are duplicates
as the lines, I got the output:
example1
example2
example3
example4
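Tying this back to the original script: the duplicates disappear if you remember which base domains you have already written. A minimal, self-contained sketch (clean_urls and its condensed regexes are my own simplification, not the asker's exact replace() chain):

import re

def clean_urls(in_path, out_path):
    seen = set()  # base domains already written
    with open(in_path) as fin, open(out_path, 'w') as fout:
        for line in fin:
            url = line.strip().replace('[.]', '.').replace('[dot]', '.').replace('hxxp', 'http')
            url = re.sub(r'^https?://', '', url)  # drop the scheme
            url = re.sub(r'^www\.', '', url)      # drop a leading www.
            base = re.sub(r'/.*', '', url)        # drop the path
            if base and base not in seen:
                seen.add(base)
                fout.write(base + '\n')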
Here is my code for reading individual cells of one CSV file, but I want to read multiple CSV files one by one, from a .txt file where the CSV file paths are listed.
import csv

ifile = open("C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "rb")
data = list(csv.reader(ifile, delimiter=';'))
REQ = []
RES = []
n = len(data)
for i in range(n):
    x = data[i][1]
    y = data[i][2]
    REQ.append(x)
    RES.append(y)
    i += 1
for j in range(2, n):
    try:
        if REQ[j] != '' and RES[j] != '':  # ignore blank cell
            print REQ[j], ' ', RES[j]
    except:
        pass
    j += 1
And the CSV file paths are stored in a .txt file, like:
C:\Desktop\Test_Specification\RDBI.csv
C:\Desktop\Test_Specification\ECUreset.csv
C:\Desktop\Test_Specification\RDTC.csv
and so on..
You can read stuff stored in files into variables. And you can use variables with strings in them anywhere you can use a literal string. So...
with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()  # or was it trim()? I keep mixing them up
        ifile = open(file_name, 'rb')
        # ... the rest of your code goes here
Maybe we can fix this up a little...

import csv

with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()
        csv_file = csv.reader(open(file_name, 'rb'), delimiter=';')
        next(csv_file)  # skip the header row; a reader object cannot be sliced
        for record in csv_file:
            req = record[1]
            res = record[2]
            if len(req + res):
                print req, ' ', res
You just need to add a loop that reads the file containing your list of file paths before your first open statement, for example:

from __future__ import with_statement

with open("myfile_which_contains_file_path.txt") as f:
    for line in f:
        ifile = open(line.strip(), 'rb')  # strip the trailing newline from the path
        # here the rest of your code
You need to use a raw string, since your path contains backslashes:

import csv

file_list = r"C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"

with open(file_list) as f:
    for line in f:
        with open(line.strip(), 'rb') as the_file:
            reader = csv.reader(the_file, delimiter=';')
            for row in reader:
                req, res = row[1:3]
                if req and res:
                    print('{0} {1}'.format(req, res))
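For completeness, here is a Python 3 flavour of the same idea, with context managers so every file is closed and newline='' as the csv module recommends (file_list.txt is a made-up name for the path-list file):

import csv

with open(r"C:\Users\BKA4ABT\Desktop\Test_Specification\file_list.txt") as path_file:
    for path in path_file:
        # each line of file_list.txt is assumed to hold one CSV path
        with open(path.strip(), newline='') as csv_file:
            for row in csv.reader(csv_file, delimiter=';'):
                if len(row) >= 3 and row[1] and row[2]:
                    print(row[1], row[2])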