Python unicode CSV export using Pyramid

I'm trying to export MongoDB data that contains non-ASCII characters to CSV format.
Right now I'm dabbling with Pyramid and using pyramid.response.
import csv
import os
import time

from pyramid.response import Response
from pyramid.view import view_config

from mycart.Member import Member

@view_config(context="mycart:resources.Member", name='', request_method="POST", permission='admin')
def member_export(context, request):
    filename = 'member-' + time.strftime("%Y%m%d%H%M%S") + ".csv"
    download_path = os.getcwd() + '/MyCart/mycart/static/downloads/' + filename

    member = Member(request)
    my_list = [['First Name', 'Last Name']]

    record = member.get_all_member()
    for r in record:
        mystr = [r['fname'], r['lname']]
        my_list.append(mystr)

    with open(download_path, 'wb') as f:
        fileWriter = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for l in my_list:
            print(l)
            fileWriter.writerow(l)

    size = os.path.getsize(download_path)
    response = Response(content_type='application/force-download',
                        content_disposition='attachment; filename=' + filename)
    response.app_iter = open(download_path, 'rb')
    response.content_length = size
    return response
In MongoDB the first name shows as 王, and when I print it, it also shows 王. However, when I open the file in Excel, it shows mojibake instead: ç¾…
However, when I view the file in the shell:
$ more member-20130227141550.csv
it displays the non-ASCII characters correctly.
How should I fix this?

I'm not a Windows guy, so I'm not sure whether the problem is with your code or with Excel just not handling non-ASCII characters nicely. But I have noticed that you are writing your file with Python's csv module, which is notorious for unicode headaches.
Other users have reported success with unicodecsv as a replacement for the csv module. Perhaps you could drop that module in as your csv writer and see if your problem magically goes away.
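A minimal sketch of that swap, keeping the view from the question (it assumes the rows in my_list are unicode strings); writing a UTF-8 BOM up front is the usual trick to make Excel decode the file as UTF-8 instead of the system codepage:
import codecs
import unicodecsv

with open(download_path, 'wb') as f:
    f.write(codecs.BOM_UTF8)  # BOM first so Excel auto-detects UTF-8
    writer = unicodecsv.writer(f, encoding='utf-8')
    for row in my_list:
        writer.writerow(row)  # unicodecsv encodes each unicode field for you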

Related

Set up a crawler and downloaded tweets. Unable to parse JSON file

I have been trying to parse a JSON file and it keeps giving me "Extra data" errors. Since I am new to Python, I have no idea how to resolve this. It seems there are multiple objects within the file. How do I parse it without getting any errors?
Edit: (not my code, but I am trying to work on it)
import json
import csv
import io

'''
creates a .csv file using a Twitter .json file
the fields have to be set manually
'''

data_json = io.open('filename', mode='r', encoding='utf-8').read()  # reads in the JSON file
data_python = json.loads(data_json)

csv_out = io.open('filename', mode='w', encoding='utf-8')  # opens csv file

fields = u'created_at,text,screen_name,followers,friends,rt,fav'  # field names
csv_out.write(fields)
csv_out.write(u'\n')

for line in data_python:
    # writes a row and gets the fields from the json object
    # screen_name and followers/friends are found on the second level, hence two get methods
    row = [line.get('created_at'),
           '"' + line.get('text').replace('"', '""') + '"',  # creates double quotes
           line.get('user').get('screen_name'),
           unicode(line.get('user').get('followers_count')),
           unicode(line.get('user').get('friends_count')),
           unicode(line.get('retweet_count')),
           unicode(line.get('favorite_count'))]
    row_joined = u','.join(row)
    csv_out.write(row_joined)
    csv_out.write(u'\n')

csv_out.close()
Edit 2: I found another recipe to parse it but there is no way for me to save the output. Any recommendations?
import json
import re

json_as_string = open('filename.json', 'r')

# Call this as a recursive function if your json is highly nested
lines = [re.sub(r"[\[\{\]]*", "", one_object.rstrip())
         for one_object in json_as_string.readlines()]
json_as_list = "".join(lines).split('}')

for elem in json_as_list:
    if len(elem) > 0:
        print(json.loads(json.dumps("{" + elem[::1] + "}")))
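For what it's worth, if the crawler wrote one tweet object per line (the common JSON Lines layout, which is exactly what makes json.loads fail with "Extra data" when given the whole file at once), a simpler sketch is to parse line by line; 'filename.json' is a placeholder name:
import io
import json

tweets = []
with io.open('filename.json', encoding='utf-8') as f:  # placeholder name
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            tweets.append(json.loads(line))  # one JSON object per line

# tweets is now a list of dicts that the first recipe's loop can consume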

How to generate a file without saving it to disk in Python?

I'm using Python 2.7 and Django 1.7.
I have a method in my admin interface that generates some kind of a csv file.
def generate_csv(args):
    ...
    # some code that generates a dictionary to be written as csv
    ...
    # this creates a directory and returns its filepath
    dirname = create_csv_dir('stock')
    csvpath = os.path.join(dirname, 'mycsv_file.csv')
    fieldnames = [...]  # some field names
    # this function creates the csv file in the directory shown by the csvpath
    newcsv(data, csvheader, csvpath, fieldnames)
    # this automatically starts a download from that directory
    return HttpResponseRedirect('/media/csv/stock/%s' % csvfile)
All in all: I create a csv file, save it somewhere on disk, and then pass its URL to the user for download.
I was wondering whether all of this could be done without writing to disk. I googled around a bit and it seems Content-Disposition: attachment might help, but I got lost in the documentation.
Anyway, if there's an easier way of doing this I'd love to know.
Thanks to @Ragora, who pointed me in the right direction.
I rewrote the newcsv method:
from io import StringIO
import csv

def newcsv(data, csvheader, fieldnames):
    """
    Create a new csv file that represents generated data.
    """
    new_csvfile = StringIO()
    wr = csv.writer(new_csvfile, quoting=csv.QUOTE_ALL)
    wr.writerow(csvheader)
    wr = csv.DictWriter(new_csvfile, fieldnames=fieldnames)

    for key in data.keys():
        wr.writerow(data[key])

    return new_csvfile
and in the admin:
csvfile = newcsv(data, csvheader, fieldnames)
response = HttpResponse(csvfile.getvalue(), content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=stock.csv'
return response
If it annoys you that you are saving a file to disk, just serve the response with the application/octet-stream content type plus a Content-Disposition header, then delete the file from disk (a sketch follows).
If this header (Content-Disposition) is used in a response with the application/octet-stream content-type, the implied suggestion is that the user agent should not display the response, but directly enter a 'save response as...' dialog.
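A rough sketch of that variant under the question's Django 1.7 setup; serve_stock_csv is a hypothetical view name and csvpath points at the file newcsv wrote to disk:
import os
from django.http import HttpResponse

def serve_stock_csv(request, csvpath):  # hypothetical view
    with open(csvpath, 'rb') as f:
        response = HttpResponse(f.read(), content_type='application/octet-stream')
    response['Content-Disposition'] = 'attachment; filename=stock.csv'
    os.remove(csvpath)  # the bytes are already in the response, so drop the temp file
    return response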

CSV new-line character seen in unquoted field error

the following code worked until today, when I imported a file from a Windows machine and got this error:
new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
import csv

class CSV:
    def __init__(self, file=None):
        self.file = file

    def read_file(self):
        data = []
        file_read = csv.reader(self.file)
        for row in file_read:
            data.append(row)
        return data

    def get_row_count(self):
        return len(self.read_file())

    def get_column_count(self):
        new_data = self.read_file()
        return len(new_data[0])

    def get_data(self, rows=1):
        data = self.read_file()
        return data[:rows]
How can I fix this issue?
def upload_configurator(request, id=None):
    """
    A view that allows the user to configure the uploaded CSV.
    """
    upload = Upload.objects.get(id=id)
    csvobject = CSV(upload.filepath)

    upload.num_records = csvobject.get_row_count()
    upload.num_columns = csvobject.get_column_count()
    upload.save()

    form = ConfiguratorForm()

    row_count = csvobject.get_row_count()
    colum_count = csvobject.get_column_count()
    first_row = csvobject.get_data(rows=1)
    first_two_rows = csvobject.get_data(rows=5)
It would be good to see the csv file itself, but this might work for you. Give it a try: replace
file_read = csv.reader(self.file)
with:
file_read = csv.reader(self.file, dialect=csv.excel_tab)
Or, open a file with universal newline mode and pass it to csv.reader, like:
reader = csv.reader(open(self.file, 'rU'), dialect=csv.excel_tab)
Or, use splitlines(), like this:
def read_file(self):
    with open(self.file, 'r') as f:
        data = [row for row in csv.reader(f.read().splitlines())]
    return data
I realize this is an old post, but I ran into the same problem and don't see the correct answer, so I will give it a try.
Python Error:
_csv.Error: new-line character seen in unquoted field
This is caused by trying to read Macintosh (pre-OS X) formatted CSV files. These are text files that use CR for the end of line. If using MS Office, make sure you select either plain CSV format or CSV (MS-DOS); do not use CSV (Macintosh) as the save-as type.
My preferred EOL version would be LF (Unix/Linux/Apple), but I don't think MS Office provides the option to save in this format.
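If re-saving from Office isn't an option, a small workaround sketch (assuming a UTF-8 file with the placeholder name oldmac.csv): normalize the CR endings before parsing.
import csv

# read the raw bytes, turn classic-Mac CR line endings into LF, then parse
with open('oldmac.csv', 'rb') as f:  # placeholder filename
    text = f.read().decode('utf-8').replace('\r', '\n')
rows = list(csv.reader(text.splitlines()))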
For Mac OS X, save your CSV file in "Windows Comma Separated (.csv)" format.
If this happens to you on a Mac (as it did to me):
Save the file as CSV (MS-DOS Comma-Separated).
Run the following script:
with open(csv_filename, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        print ', '.join(row)
Try running dos2unix on your Windows-imported files first.
This is an error that I faced. I had saved the .csv file on Mac OS X.
While saving, save it as "Windows Comma Separated Values (.csv)", which resolved the issue.
This worked for me on OSX.
import csv
# allow a string variable to be opened as a file
from io import StringIO
# library to map other strange (accented) characters back into UTF-8
from unidecode import unidecode

# cleanse an input file with Windows formatting into a plain UTF-8 string
with open(filename, 'rb') as fID:
    uncleansedBytes = fID.read()

# decode the file using the correct encoding scheme
# (probably this old windows one)
uncleansedText = uncleansedBytes.decode('Windows-1252')

# replace carriage-returns with new-lines
cleansedText = uncleansedText.replace('\r', '\n')

# map any other non UTF-8 characters into UTF-8
asciiText = unidecode(cleansedText)

# read each line of the csv file and store as an array of dicts,
# using the first line as field names for each dict
reader = csv.DictReader(StringIO(cleansedText))
for line_entry in reader:
    # do something with your read data
    pass
I know this was answered quite some time ago, but it did not solve my problem. I am using DictReader and StringIO for my csv reading due to some other complications. I was able to solve the problem more simply by replacing the line endings explicitly:
import urllib.request

with urllib.request.urlopen(q) as response:
    raw_data = response.read()
    encoding = response.info().get_content_charset('utf8')
    data = raw_data.decode(encoding)
    if '\r\n' not in data:
        # probably a windows-delimited thing... try to update it
        data = data.replace('\r', '\r\n')
Might not be reasonable for enormous CSV files, but it worked well for my use case.
Alternative and fast solution: I faced the same error. I reopened the "weird" csv file in Gnumeric on my Lubuntu machine and exported it as a csv file again. This corrected the issue.

Getting "newline inside string" while reading the csv file in Python?

I have this utils.py file in my Django project:
def range_data(ip):
    r = []
    f = open(os.path.join(settings.PROJECT_ROOT, 'static', 'csv ',
                          'GeoIPCountryWhois.csv'))
    for num, row in enumerate(csv.reader(f)):
        if row[0] <= ip <= row[1]:
            r.append([row[4]])
            return r
        else:
            continue
    return r
Here the ip parameter is just an IPv4 address; I am using the open-source MaxMind GeoIPCountryWhois.csv file.
Some starting content of GeoIPCountryWhois.csv:
"1.0.0.0","1.0.0.255","16777216","16777471","AU","Australia"
"1.0.1.0","1.0.3.255","16777472","16778239","CN","China"
"1.0.4.0","1.0.7.255","16778240","16779263","AU","Australia"
"1.0.8.0","1.0.15.255","16779264","16781311","CN","China"
"1.0.16.0","1.0.31.255","16781312","16785407","JP","Japan"
"1.0.32.0","1.0.63.255","16785408","16793599","CN","China"
"1.0.64.0","1.0.127.255","16793600","16809983","JP","Japan"
"1.0.128.0","1.0.255.255","16809984","16842751","TH","Thailand"
I have also read about the issue, but didn't find the explanations very understandable. Would you please help me solve this error?
In my utils method, I am checking the country name of the IP address passed to the method.
I had a similar problem earlier today: there was an end quote missing from a line, and the solution was to instruct the reader to perform no special processing of quote characters (quoting=csv.QUOTE_NONE), as sketched below.
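A minimal sketch of that suggestion against the file from the question; note that with QUOTE_NONE the surrounding quotes stay in each field, so you may want to strip them off afterwards:
import csv

# QUOTE_NONE makes the reader treat quote characters as ordinary data,
# so an unbalanced quote can no longer swallow a line ending
with open('GeoIPCountryWhois.csv') as f:
    for row in csv.reader(f, quoting=csv.QUOTE_NONE):
        print(row)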
You can preprocess the csv by normalizing the line endings, like below:
import csv

content = open("GeoIPCountryWhois.csv", "r").read().replace('\r\n', '\n')
with open("GeoIPCountryWhois2.csv", "w") as g:
    g.write(content)
Then use GeoIPCountryWhois2.csv for the csv reader.
A wild guess: using a lineterminator may solve your problem:
for num, row in enumerate(csv.reader(f, lineterminator='\n')):
See also: http://docs.python.org/lib/csv-fmt-params.html
You must open your files as binary:
def range_data(ip):
    r = []
    f = open(os.path.join(settings.PROJECT_ROOT, 'static', 'csv ',
                          'GeoIPCountryWhois.csv'), 'rb')
    for num, row in enumerate(csv.reader(f)):
        # Your things.
Note the 'rb' mode there; otherwise the file could be opened with native line endings, and the CSV reader doesn't handle the various forms very well. Certainly the copy of GeoIPCountryWhois.csv that I downloaded has clean \n line endings.
This is documented for the .reader() method:
If csvfile is a file object, it must be opened with the ‘b’ flag on platforms where that makes a difference.
If, however, your csv file is so corrupted as to still contain unexpected newline characters in unexpected places, use this file subclass instead as a stop-gap measure:
class CleanlinesFile(file):
    def next(self):
        line = super(CleanlinesFile, self).next()
        return line.replace('\r', '').replace('\n', '') + '\n'
This class guarantees there will be no newlines anywhere in the returned results except as the very last character (just the way the csv module wants it). Use it instead of the open call; the 'rb' mode modifier becomes optional in this case:
def range_data(ip):
    r = []
    f = CleanlinesFile(os.path.join(settings.PROJECT_ROOT, 'static', 'csv ',
                                    'GeoIPCountryWhois.csv'))
    for num, row in enumerate(csv.reader(f)):
        # Your things.

Process large text file in Python

I have a very large file (3.8 GB) that is an extract of users from a system at my school. I need to reprocess that file so that it contains just their ID and email address, comma-separated.
I have very little experience with this and would like to use it as a learning exercise for Python.
The file has entries that look like this:
dn: uid=123456789012345,ou=Students,o=system.edu,o=system
LoginId: 0099886
mail: fflintstone@system.edu

dn: uid=543210987654321,ou=Students,o=system.edu,o=system
LoginId: 0083156
mail: brubble@system.edu
I am trying to get a file that looks like:
0099886,fflintstone@system.edu
0083156,brubble@system.edu
Any tips or code?
That actually looks like an LDIF file to me. The python-ldap library has a pure-Python LDIF handling module that could help if your file has some of the nasty gotchas possible in LDIF, e.g. Base64-encoded values, entry folding, etc.
You could use it like so:
import csv
import ldif

class ParseRecords(ldif.LDIFParser):
    def __init__(self, input, csv_writer):
        ldif.LDIFParser.__init__(self, input)
        self.csv_writer = csv_writer

    def handle(self, dn, entry):
        self.csv_writer.writerow([entry['LoginId'], entry['mail']])

with open('/path/to/large_file') as input, open('output_file', 'wb') as output:
    csv_writer = csv.writer(output)
    csv_writer.writerow(['LoginId', 'Mail'])
    ParseRecords(input, csv_writer).parse()
Edit
So to extract from a live LDAP directory, using the python-ldap library you would want to do something like this:
import csv
import ldap

con = ldap.initialize('ldap://server.fqdn.system.edu')
# if your LDAP directory requires authentication:
# con.bind_s(username, password)
try:
    with open('output_file', 'wb') as output:
        csv_writer = csv.writer(output)
        csv_writer.writerow(['LoginId', 'Mail'])
        for dn, attrs in con.search_s('ou=Students,o=system.edu,o=system',
                                      ldap.SCOPE_SUBTREE,
                                      attrlist=['LoginId', 'mail']):
            csv_writer.writerow([attrs['LoginId'], attrs['mail']])
finally:
    # even if you don't have credentials, it's usually good to unbind
    con.unbind_s()
It's probably worthwhile reading through the documentation for the ldap module, especially the example.
Note that in the example above I completely skipped supplying a filter, which you would probably want to do in production. A filter in LDAP is similar to the WHERE clause in a SQL statement; it restricts what objects are returned. Microsoft actually has a good guide on LDAP filters. The canonical reference for LDAP filters is RFC 4515. A sketch follows.
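For instance, the same search with a filter applied might look like this (the objectClass used here is an assumption about your directory's schema):
# restrict results to person entries that actually have a mail attribute;
# filter syntax per RFC 4515
results = con.search_s(
    'ou=Students,o=system.edu,o=system',
    ldap.SCOPE_SUBTREE,
    filterstr='(&(objectClass=person)(mail=*))',  # assumed schema
    attrlist=['LoginId', 'mail'],
)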
Similarly, if there are potentially several thousand entries even after applying an appropriate filter, you may need to look into the LDAP paging control, though using that would, again, make the example more complex. Hopefully that's enough to get you started, but if anything comes up, feel free to ask or open a new question.
Good luck.
Assuming that the structure of each entry will always be the same, just do something like this:
import csv

# Open the file
f = open("/path/to/large.file", "r")

# Create an output file
output_file = open("/desired/path/to/final/file", "w")

# Use the CSV module to make use of existing functionality.
final_file = csv.writer(output_file)

# Write the header row - can be skipped if headers not needed.
final_file.writerow(["LoginID", "EmailAddress"])

# Set up our temporary cache for a user
current_user = []

# Iterate over the large file
# Note that we are avoiding loading the entire file into memory
for line in f:
    if line.startswith("LoginId"):
        current_user.append(line[9:].strip())
    # If more information is desired, simply add it to the conditions here
    # (additional elif's should do)
    # and add it to the current user.
    elif line.startswith("mail"):
        current_user.append(line[6:].strip())
        # Once you know you have reached the end of a user entry,
        # write the row to the final file
        # and clear your temporary list.
        final_file.writerow(current_user)
        current_user = []
    # Skip lines that aren't interesting.
    else:
        continue
Again assuming your file is well-formed:
with open(inputfilename) as inputfile, open(outputfilename, 'w') as outputfile:
    mail = loginid = ''
    for line in inputfile:
        line = line.split(':')
        if line[0] not in ('LoginId', 'mail'):
            continue
        if line[0] == 'LoginId':
            loginid = line[1].strip()
        if line[0] == 'mail':
            mail = line[1].strip()
        if mail and loginid:
            outputfile.write(loginid + ',' + mail + '\n')
            mail = loginid = ''
Essentially equivalent to the other methods.
To open the file you'll want to use something like the with keyword to ensure it closes properly even if something goes wrong:
with open(<your_file>, "r") as f:
    # Do stuff
As for actually parsing out that information, I'd recommend building a dictionary of ID/email pairs. You'll also need variables for the uid and the email:
data = {}
uid = 0
email = ""
To actually parse through the file (the stuff run while your file is open) you can do something like this:
for line in f:
    if "uid=" in line:
        # Parse the user id out by grabbing the substring between the first = and ,
        uid = line[line.find("=")+1:line.find(",")]
    elif "mail:" in line:
        # Parse the email out by grabbing everything from the : to the end
        # (removing the newline character)
        email = line[line.find(": ")+2:-1]
        # Given the formatting you've provided, the mail line comes second,
        # so we can make an entry in the dict here
        data[uid] = email
Using the CSV writer (remember to import csv at the beginning of the file), we can output it like this:
writer = csv.writer(<filename>)
writer.writerow(["User", "Email"])
for uid, mail in data.iteritems():
    writer.writerow([uid, mail])
Another option is to open the writer before the file, write the header, then read lines from the file while simultaneously writing to the CSV. This avoids holding all the information in memory, which might be highly desirable. So, putting it all together, we get:
writer = csv.writer(<filename>)
writer.writerow(["User", "Email"])

with open(<your_file>, "r") as f:
    for line in f:
        if "uid=" in line:
            # Parse the user id out by grabbing the substring between the first = and ,
            uid = line[line.find("=")+1:line.find(",")]
        elif "mail:" in line:
            # Parse the email out by grabbing everything from the : to the end
            # (removing the newline character)
            email = line[line.find(": ")+2:-1]
            # Given the formatting you've provided, the mail line comes second,
            # so we can write the row here
            writer.writerow([uid, email])
