Excel's limit for URLs (force_unicode(url)) - Python

My goal is to open an Excel file, add a new column, and save the result as a new file. Unfortunately the code crashes at the save step.
import pandas as pd

path = '//My Documents/Python/'
fileName = "test.xlsx"
ef = pd.ExcelFile(path + fileName)
df = pd.read_excel(path + fileName, sheet_name=ef.sheet_names[0])
i = 1
for test in df['Content']:
    try:
        df['Content'] = df['Content'].astype(str)
    except:
        print("An exception occurred")
        break
    i += 1
print('success')
df.to_excel('/My Documents/Python/test_NEW.xlsx')
The error message is:
with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS force_unicode(url)) Traceback (most recent call last):
Now my question is: is the save method wrong? How can I save columns with more than 255 characters, or how can I cut the link after 255 characters?
Thank you in advance! I would be very happy to receive an answer.
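For what it's worth, two approaches that should work (a sketch, untested against this particular file; the exact ExcelWriter keyword depends on your pandas version): hard-truncate the column to 255 characters before saving, or tell the xlsxwriter engine not to convert strings to URLs at all.

import pandas as pd

# Option 1: cut every value down to Excel's 255-character URL limit.
df['Content'] = df['Content'].astype(str).str.slice(0, 255)

# Option 2: keep the full strings but disable URL conversion in xlsxwriter.
# Newer pandas versions take engine_kwargs as below; older ones accept
# options={'strings_to_urls': False} directly on pd.ExcelWriter.
with pd.ExcelWriter('/My Documents/Python/test_NEW.xlsx',
                    engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    df.to_excel(writer)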

Related

struct.error: unpack requires a string argument of length 12

I am trying to follow a tutorial from Coding Robin to create a Haar classifier: http://coding-robin.de/2013/07/22/train-your-own-opencv-haar-classifier.html.
I am at the part where I need to merge all the .vec files. I am trying to execute the Python script given there and I am getting the following error:
Traceback (most recent call last):
  File "mergevec.py", line 170, in <module>
    merge_vec_files(vec_directory, output_filename)
  File "mergevec.py", line 133, in merge_vec_files
    val = struct.unpack('<iihh', content[:12])
struct.error: unpack requires a string argument of length 12
Here is the code from the Python script:
# Get the value for the first image size
prev_image_size = 0
try:
    with open(files[0], 'rb') as vecfile:
        content = ''.join(str(line) for line in vecfile.readlines())
        val = struct.unpack('<iihh', content[:12])
        prev_image_size = val[1]
except IOError as e:
    print('An IO error occurred while processing the file: {0}'.format(f))
    exception_response(e)

# Get the total number of images
total_num_images = 0
for f in files:
    try:
        with open(f, 'rb') as vecfile:
            content = ''.join(str(line) for line in vecfile.readlines())
            val = struct.unpack('<iihh', content[:12])
            num_images = val[0]
            image_size = val[1]
            if image_size != prev_image_size:
                err_msg = """The image sizes in the .vec files differ. These values must be the same. \n The image size of file {0}: {1}\n
                The image size of previous files: {2}""".format(f, image_size, prev_image_size)
                sys.exit(err_msg)
            total_num_images += num_images
    except IOError as e:
        print('An IO error occurred while processing the file: {0}'.format(f))
        exception_response(e)
I tried looking through existing answers, but can't find one that fits this specific problem. Any help will be appreciated.
Thank you!
I figured it out by going to the GitHub page for the tutorial. Apparently, I had to delete any .vec files that had a length of zero.
Your problem is this bit:
content[:12]
The string is not guaranteed to be 12 characters long; it could be shorter. Add a length check and handle short files separately, or wrap the unpack in try/except and give the user a saner error message like "Invalid input in file ...".
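A minimal sketch of such a check (read_vec_header is a hypothetical helper name, not part of the tutorial script); it reads the 12-byte header directly instead of joining lines:

import struct

def read_vec_header(path):
    # The .vec header is 12 bytes: num_images (int), image_size (int),
    # followed by two shorts.
    with open(path, 'rb') as vecfile:
        header = vecfile.read(12)
    if len(header) < 12:
        # Zero-length or truncated .vec files end up here instead of
        # crashing struct.unpack.
        print('Invalid input in file {0}: only {1} header bytes'.format(path, len(header)))
        return None
    num_images, image_size, _, _ = struct.unpack('<iihh', header)
    return (num_images, image_size)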

UnicodeEncodeError in Python CSV manipulation script

I have a script that was working earlier but now stops with a UnicodeEncodeError.
I am using Python 3.4.3.
The full error message is the following:
Traceback (most recent call last):
  File "R:/A/APIDevelopment/ScivalPubsExternal/Combine/ScivalPubsExt.py", line 58, in <module>
    outputFD.writerow(row)
  File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x8a' in position 413: character maps to <undefined>
How can I address this error?
The Python script is the following:
import pdb
import csv, sys, os
import glob
import os
import codecs

os.chdir('R:/A/APIDevelopment/ScivalPubsExternal/Combine')
joinedFileOut = 'ScivalUpdate'
csvSourceDir = "R:/A/APIDevelopment/ScivalPubsExternal/Combine/AustralianUniversities"

# create dictionary from Codes file (Institution names and codes)
codes = csv.reader(open('Codes.csv'))
# rows of the file are stored as lists/arrays
InstitutionCodesDict = {}
InstitutionYearsDict = {}
for row in codes:
    # keys: institution names, values: institution codes
    InstitutionCodesDict[row[0]] = row[1]
    # define year dictionary with an empty values field
    InstitutionYearsDict[row[0]] = []

# create a file descriptor for the output file; 'wt' means text mode
# ('rt' or 'r' is the equivalent for reading)
with open(joinedFileOut, 'wt') as csvWriteFD:
    # write the file (it is still empty here)
    outputFD = csv.writer(csvWriteFD, delimiter=',')
    # 'with' closes the file at the end, or earlier if an exception occurs
    # open each Scival file, create a file descriptor (encoding needed),
    # then read it and print the name of the file
    if not glob.glob(csvSourceDir + "/*.csv"):
        print("CSV source files not found")
        sys.exit()
    for scivalFile in glob.glob(csvSourceDir + "/*.csv"):
        #with open(scivalFile, "rt", encoding="utf8") as csvInFD:
        with open(scivalFile, "rt", encoding="ISO-8859-1") as csvInFD:
            fileFD = csv.reader(csvInFD)
            print(scivalFile)
            # condition for the loop
            printon = False
            # read all rows in the file; each row becomes a list
            for row in fileFD:
                if len(row) > 1:
                    # the printon branch is skipped for the rows above the
                    # data because printon is not yet True
                    if printon:
                        # insert instcode and institution sequentially into
                        # each data row (i.e. after the header row)
                        row.insert(0, InstitutionCode)
                        row.insert(0, Institution)
                        if row[10].strip() == "-":
                            row[10] = " "
                        else:
                            p = row[10].zfill(8)
                            q = p[0:4] + '-' + p[4:]
                            row[10] = q
                        # write the output file
                        outputFD.writerow(row)
                    else:
                        if "Publications at" in row[1]:
                            # get the institution name from cell B1
                            Institution = row[1].replace('Publications at the ', "").replace('Publications at ', "")
                            print(Institution)
                            # look up the institution code in the dictionary
                            InstitutionCode = InstitutionCodesDict[Institution]
                        # printon gets set to True after the header row
                        if "Title" in row[0]: printon = True
                        if "Publication years" in row[0]:
                            # remember the year so we can report which years were pulled
                            year = row[1]
                            # add the year to the institution's entry in the dictionary
                            if not year in InstitutionYearsDict[Institution]:
                                InstitutionYearsDict[Institution].append(year)

# Write a report showing the institution name followed by the years for
# which we have that institution's data.
with open("Instyears.txt", "w") as instReportFD:
    for inst in InstitutionYearsDict:
        instReportFD.write(inst)
        for yr in InstitutionYearsDict[inst]:
            instReportFD.write(" " + yr)
        instReportFD.write("\n")
Make sure to use the correct encodings for your source and destination files. You open files in three places:
codes = csv.reader(open('Codes.csv'))
: : :
with open(joinedFileOut,'wt') as csvWriteFD:
    outputFD = csv.writer(csvWriteFD, delimiter=',')
: : :
with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
    fileFD = csv.reader(csvInFD)
This should look something like:
# Use the correct encoding. If you made this file on
# Windows it is likely Windows-1252 (also known as cp1252):
with open('Codes.csv', encoding='cp1252') as f:
    codes = csv.reader(f)
: : :
# The output encoding can be anything you want. UTF-8
# supports all Unicode characters. Windows apps tend to like
# files to start with a UTF-8 BOM if the file is UTF-8,
# so 'utf-8-sig' is an option. newline='' stops the csv module
# from writing blank rows on Windows.
with open(joinedFileOut, 'w', encoding='utf-8-sig', newline='') as csvWriteFD:
    outputFD = csv.writer(csvWriteFD)
: : :
# This file is probably the cause of your problem and is not ISO-8859-1.
# Maybe UTF-8 instead? 'utf-8-sig' will safely handle and remove a UTF-8 BOM
# if present.
with open(scivalFile, 'r', encoding='utf-8-sig') as csvInFD:
    fileFD = csv.reader(csvInFD)
The error is caused by an attempt to write a string containing a U+008A character using the default cp1252 encoding of your system. It is trivial to make the error go away: just declare a latin1 (or ISO-8859-1) encoding for your output file, because latin1 writes every code point below 256 straight through as the corresponding byte, without conversion:
with open(joinedFileOut, 'wt', encoding='latin1') as csvWriteFD:
But this will only hide the real problem: where does this 0x8a character come from? My advice is to intercept the exception and dump the line where it occurs:
try:
    outputFD.writerow(row)
except UnicodeEncodeError:
    # print the row, the name of the file being processed and the line number
    print(scivalFile, fileFD.line_num, row)
It is probably caused by one of the input files not being ISO-8859-1 encoded but, more likely, UTF-8 encoded...
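To follow that advice, a quick probe (a sketch; the candidate encodings are an assumption) is to decode the raw bytes of a suspect input file under several encodings and see which one produces readable text:

# Decode the same raw bytes under candidate encodings; the one that
# decodes without errors and shows readable text is the likely source.
# Note that iso-8859-1 never fails, so inspect its output by eye.
with open(scivalFile, 'rb') as f:
    raw = f.read()
for enc in ('utf-8-sig', 'cp1252', 'iso-8859-1'):
    try:
        print(enc, '->', raw.decode(enc)[:80])
    except UnicodeDecodeError as e:
        print(enc, 'failed:', e)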

Prevent datetime.strptime from exiting in case of format mismatch

I am parsing dates from a measurement file (about 200k lines). Each line holds a date and a measurement. The date looks like "2013-08-07-20-46", i.e. "%Y-%m-%d-%H-%M" as a format string. Every so often the time stamp has a bad character (the data came over a serial link that had interruptions), so an entry can look like: 201-08-11-05-15 .
My parsing line to convert the time string into seconds is:
time.mktime(datetime.datetime.strptime(dt, "%Y-%m-%d-%H-%M").timetuple())
I found it online and don't fully understand how it works (but it works).
My problem is preventing the program from exiting with an error when a format mismatch happens. Is there a way to make strptime not exit but gracefully return an error flag, in which case I would simply discard the data line and move on to the next? Yes, I could perform a pattern check with a regexp, but I was wondering whether some smart mismatch handling is already built into strptime.
Edit (re: Anand S Kumar's answer)
It worked for a few bad lines but then it failed.
fp = open('bmp085.dat', 'r')
for line in fp:
    [dt, t, p] = string.split(line)
    try:
        sec = time.mktime(datetime.datetime.strptime(dt, "%Y-%m-%d-%H-%M").timetuple()) - sec0
    except ValueError:
        print 'Bad data : ' + line
        continue  # moves on to the next iteration of the loop over lines
    print sec, p, t
    t_list.append(sec)
    p_list.append(p)
fp.close()
Output:
288240.0 1014.48 24.2
288540.0 1014.57 24.2
288840.0 1014.46 24.2
Bad data : �013-08-11-05-05 24.2! 1014.49
Bad data : 2013=0▒-11-05-10 �24.2 1014.57
Bad data : 201�-08-11-05-15 24.1 1014.57
Bad data : "0�#-08-1!-p5-22 24.1 1014.6
Traceback (most recent call last):
  File "<stdin>", line 2, in <module>
ValueError: too many values to unpack
>>>
Edit (re: Anand S Kumar's answer)
It crashed again.
for line in fp:
    print line
    dt, t, p = line.split(' ', 2)
    try:
        sec = time.mktime(datetime.datetime.strptime(dt, "%Y-%m-%d-%H-%M").timetuple()) - sec0
    except ValueError:
        print 'Bad data : ' + line
        continue  # moves on to the next iteration of the loop over lines
    print sec, p, t
It failed on:
2013-08-11�06-t5 03/9 9014.y
Bad data : 2013-08-11�06-t5 03/9 9014.y
2013-08-11-06-50 (23. 1014.96
295440.0 (23. 1014.96
2013-08-11-06%55 23.9 !�1015.01
Traceback (most recent call last):
  File "<stdin>", line 5, in <module>
TypeError: must be string without null bytes, not str
>>> fp.close()
>>>
You can use try..except, catching any ValueError; if such an error occurs, move on to the next line. Example -
try:
    time.mktime(datetime.datetime.strptime(dt, "%Y-%m-%d-%H-%M").timetuple())
except ValueError:
    continue  # if you are doing this in a loop over the lines, this moves on to the next iteration
If you are doing something else (such as a function call for each line), then return None or similar in the except block.
The second ValueError you are getting occurs on the line -
[dt,t,p] = string.split(line)
It happens because a particular line produces more than 3 elements when split. One thing you can do is use the maxsplit argument of str.split() to split at most twice, yielding at most 3 parts. Example -
dt, t, p = line.split(None, 2)
Or if you really want to use string.split() -
[dt, t, p] = string.split(line, None, 2)
Or, if you are not expecting spaces inside any of the fields, you can move the line causing the ValueError inside the try..except block and treat it as a bad line.
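Putting those together (a sketch in the asker's Python 2 style): both the split and the parse sit in one try block, and TypeError is caught as well, because Python 2's strptime raises TypeError rather than ValueError on strings containing null bytes, which is the second crash shown above.

for line in fp:
    try:
        dt, t, p = line.split(None, 2)  # ValueError if fewer than 3 fields
        sec = time.mktime(datetime.datetime.strptime(
            dt, "%Y-%m-%d-%H-%M").timetuple()) - sec0  # ValueError on bad dates
    except (ValueError, TypeError):  # TypeError: null bytes in dt
        print 'Bad data : ' + line.rstrip()
        continue
    print sec, p, t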
Use try..except in a for-loop:
for dt in data:
    try:
        print time.mktime(datetime.datetime.strptime(dt, "%Y-%m-%d-%H-%M").timetuple())
    except ValueError:
        print "Wrong format!"
        continue
Output for data = ["1998-05-14-15-45","11998-05-14-15-45","2002-05-14-15-45"]:
895153500.0
Wrong format!
1021383900.0

How can I make a file created in a function readable by other functions?

I need the contents of a file made by one function to be readable by other functions. The closest I've come is to import a function within another function, but this only causes the imported function to run. The following code is what I'm using. According to the tutorials I've read, Python will either open a file if it exists or create one if not. What's happening is that in "def spacer" the file "loader.py" is duplicated with no content.
def load():  # all this is input with a couple of filters
    first = input("1st lot#: ")
    last = input("last lot#: ")
    for a in range(first, last + 1):
        x = raw_input("?:")
        while x == "":
            print " Error",
            x = raw_input("?")
        while int(x) > 35:
            print "Error",
            x = raw_input("?")
        num = x  # python thinks this is a tuple
        num = str(num)
        f = open("loader.py", "a")  # this is the file I want to share
        f.write(num)
        f.close()
    f = open("loader.py", "r")  # just shows that the file is being appended
    print f.read()
    f.close()
    print "Finished loading"

def spacer():
    count = 0
    f = open("loader.py", "r")  # this is what I thought would open the file,
                                # but it just opens a new one with the same name
    length = len(f.read())
    print type(f.read(count))
    print f.read(count)
    print f.read(count + 1)
    for a in range(1, length + 1):
        print f.read(count)
        vector1 = int(f.read(count))
        vector2 = int(f.read(count + 1))
        if vector1 == vector2:
            space = 0
        if vector1 < vector2:
            space = vector2 - vector1
        else:
            space = (35 - vector1) + vector2
        count = +1
        b = open("store_space.py", "w")
        b.write(space)
        b.close()

load()
spacer()
This is what I get:
1st lot#: 1
last lot#: 1
?:2
25342423555619333523452624356232184517181933235991010111348287989469658293435253195472514148238543246547722232633834632
Finished loading
<type 'str'>
"Finished loading" is the end of "def load" and shows that the file is being appended. The "<type 'str'>" line is from "def spacer"; I realized Python was creating another file named "loader.py" with nothing in it, as you can see in the error messages below:
Traceback (most recent call last):
  File "C:/Python27/ex1", line 56, in <module>
    spacer()
  File "C:/Python27/ex1", line 41, in spacer
    vector1 = int(f.read(count))
ValueError: invalid literal for int() with base 10: ''
The file probably has content, but you're not reading it properly. You have:
count = 0
# ...
vector1 = int(f.read(count))
You told Python to read 0 bytes, so it returns an empty string. int() then tries to convert that empty string and fails exactly as the error says, because an empty string is not a valid representation of an integer value.
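A quick demonstration of that behaviour (a sketch in the same Python 2 style; f.read(n) reads up to n bytes from the current position, it does not fetch "character number n"):

f = open("loader.py", "r")
print repr(f.read(0))  # '' -- zero bytes requested, empty string returned
print repr(f.read(1))  # first character of the file
print repr(f.read(1))  # next character, since the file position advanced
f.close()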

"list index out of range" in python

I have code in Python to index a text file that contains Arabic words. I tested the code on an English text and it works well, but it gives me an error when I test it on an Arabic one.
Note: the text file is saved with Unicode encoding, not ANSI.
This is my code:
from whoosh import fields, index
import os.path
import csv
import codecs
from whoosh.qparser import QueryParser

# This list associates a name with each position in a row
columns = ["juza", "chapter", "verse", "voc"]

schema = fields.Schema(juza=fields.NUMERIC,
                       chapter=fields.NUMERIC,
                       verse=fields.NUMERIC,
                       voc=fields.TEXT)

# Create the Whoosh index
indexname = "indexdir"
if not os.path.exists(indexname):
    os.mkdir(indexname)
ix = index.create_in(indexname, schema)

# Open a writer for the index
with ix.writer() as writer:
    with open("h.txt", 'r') as txtfile:
        lines = txtfile.readlines()
        # Read each row in the file
        for i in lines:
            # Create a dictionary to hold the document values for this row
            doc = {}
            thisline = i.split()
            u = 0
            # Read the values for the row enumerated like
            # (0, "juza"), (1, "chapter"), etc.
            for w in thisline:
                # Get the field name from the "columns" list
                fieldname = columns[u]
                u += 1
                #if isinstance(w, basestring):
                #    w = unicode(w)
                doc[fieldname] = w
            # Pass the dictionary to the add_document method
            writer.add_document(**doc)

with ix.searcher() as searcher:
    query = QueryParser("voc", ix.schema).parse(u"بسم")
    results = searcher.search(query)
    print(len(results))
    print(results[1])
Then the error is:
Traceback (most recent call last):
  File "C:\Python27\yarab.py", line 38, in <module>
    fieldname = columns[u]
IndexError: list index out of range
This is a sample of the file:
1 1 1 كتاب
1 1 2 قرأ
1 1 3 لعب
1 1 4 كتاب
While I cannot see anything obviously wrong with that, I would make sure you're designing for error. Catch any situation where split() returns a different number of elements than expected and handle it promptly (e.g. print the row and terminate), as in the sketch below. It looks like you might be dealing with ill-formatted data.
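A defensive version of the row loop might look like this (a sketch; it skips malformed rows instead of terminating):

for i in lines:
    thisline = i.split()
    if len(thisline) != len(columns):
        # A row with missing or extra fields would otherwise push u past
        # the end of "columns" and raise IndexError.
        print("Skipping malformed row: {0!r}".format(i))
        continue
    doc = dict(zip(columns, thisline))
    writer.add_document(**doc)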
You missed the Unicode coding declaration in your script. The first line should be:
# -*- coding: utf-8 -*-
Also, to open the file with an explicit Unicode encoding, use:
import codecs
with codecs.open("h.txt", encoding='utf-8') as txtfile:
