I am building a recommendation engine. This JSON file contains event data, and I want to convert it into a DataFrame. I tried the read_json method, but it gives an error:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81
in position 21573281: character maps to <undefined>
Below are some entries from the JSON file:
{"_id":{"$oid":"57a30ce268fd0809ec4d194f"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"OfferViewed","event_timestamp":{"$numberLong":"1470183505399"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"Category":"120000","CustomerID":"4078","OfferID":"45436"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1950"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"ContextMenuItemSelected","event_timestamp":{"$numberLong":"1470183500206"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"MenuItem":"OfferList","CustomerID":"4078"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1951"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"CategoryPageCategorySelection","event_timestamp":{"$numberLong":"1470183499171"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"Category":"Recharge","CustomerID":"4078"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1952"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470183490481"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"CustomerID":"4078"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1953"},"session":{"start_timestamp":{"$numberLong":"1470181311752"},"session_id":"def5faa9-20160802-234151752","stop_timestamp":{"$numberLong":"1470181484875"}},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"_session.stop","event_timestamp":{"$numberLong":"1470183490480"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1954"},"session":{"start_timestamp":{"$numberLong":"1470193238841"},"session_id":"7b606a93-20160803-030038841"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193295093"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470193238844"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1955"},"session":{"start_timestamp":{"$numberLong":"1470193253960"},"session_id":"7b606a93-20160803-030053960","stop_timestamp":{"$numberLong":"1470193256359"}},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.stop","event_timestamp":{"$numberLong":"1470193278227"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1956"},"session":{"start_timestamp":{"$numberLong":"1470193253960"},"session_id":"7b606a93-20160803-030053960"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470193253960"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1957"},"session":{"start_timestamp":{"$numberLong":"1470193238841"},"session_id":"7b606a93-20160803-030038841","stop_timestamp":{"$numberLong":"1470193244581"}},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.stop","event_timestamp":{"$numberLong":"1470193253959"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1958"},"session":{"start_timestamp":{"$numberLong":"1470193331290"},"session_id":"7b606a93-20160803-030211290"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470193331291"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
Wrong encoding. Explicitly read it as UTF-8, e.g. (edit: plus handling of 'dirty' line feeds, LF a.k.a. \n):
import io

# Decode explicitly as UTF-8 so the read does not depend on the platform's
# default codec (the 'charmap' codec in the reported error).
with open(datafilename, encoding="utf8") as f:
    # Reading file as list of lines, removing useless trailing whitespace
    data = [line.rstrip() for line in f]
# Joining lines back together with one JSON document per line; blank lines
# are dropped. Joining with '' would yield "{...}{...}", which is not valid JSON.
data = '\n'.join(line for line in data if line)
# Loading dataframe from the JSON Lines text: lines=True parses each line as
# its own record instead of expecting a single JSON document.
df = pandas.read_json(io.StringIO(data), lines=True)
You could try using:
import json

# Decode explicitly as UTF-8; relying on the platform default encoding is what
# produces a UnicodeDecodeError on files containing non-ASCII bytes.
with open('myfile.json', encoding='utf-8') as json_data:
    # json.load parses the whole file as a single JSON document.
    d = json.load(json_data)
    print(d)
Without more info it's difficult to advise.
As the error says, you have an issue with the encoding. When you read in the file, you need to change the encoding:
# Open `filename` with an explicit UTF-8 encoding rather than the default one.
file = open(filename, encoding="utf8")
I'm trying to achieve the following:
input: xls file
output: csv file
I want to read the xls and do some manipulations (rewrite the headers (original: customernumer, csv needs Customer_Number__c), remove some columns, etc.).
Right now I'm already reading the xls and try to write as csv (without any manipulations), but I'm struggling because of the coding.
The original file contains some "special" characters like "/", "\", and most importantly "ä, ü, ö, ß".
I get the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 8: ordinal not in range(128)
I have no clue which special characters can be in a file, this changes from time to time.
here is my current sandbox code:
# -*- coding: utf-8 -*-
__author__ = 'adieball'
import xlrd
import csv
from os import sys
import argparse
def main():
    """Convert the first worksheet of an Excel workbook into a fully quoted CSV file.

    Command-line arguments:
        inname: path of the input workbook.
        --outname: optional path of the CSV file to write; when omitted, the
            input name with ".csv" appended is used.

    Text cells are written UTF-8 encoded, so characters outside ASCII
    (such as "ä", "ü", "ö", "ß") no longer raise UnicodeEncodeError.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("inname", type=str,
                        help="Names of the Input File in single quotes")
    parser.add_argument("--outname", type=str,
                        help="Optional enter the name of the output (csv) file. if nothing is given, "
                             "we use the name of the input file and add .csv to it")
    args = parser.parse_args()
    outname = args.inname + ".csv" if args.outname is None else args.outname

    wb = xlrd.open_workbook(args.inname)
    xl_sheet = wb.sheet_by_index(0)
    print(args.inname)
    print('Retrieved worksheet: %s' % xl_sheet.name)
    print(outname)

    text_type = type(u'')
    if str is bytes:
        # Python 2: the csv module works on byte strings, so the file is
        # opened in binary mode and every text cell is encoded explicitly.
        output = open(outname, 'wb')

        def to_cell(value):
            return value.encode('utf-8') if isinstance(value, text_type) else value
    else:
        # Python 3: the csv module works on text; the file object encodes.
        output = open(outname, 'w', newline='', encoding='utf-8')

        def to_cell(value):
            return value

    # The with-block closes the file even if a row fails to write.
    with output:
        wr = csv.writer(output, quoting=csv.QUOTE_ALL)
        for rownum in range(xl_sheet.nrows):
            wr.writerow([to_cell(value) for value in xl_sheet.row_values(rownum)])
anything I can do here to make sure these special characters get written to the csv in the same way as they appeared in the original xls?
thanks
andre
a simple
# NOTE(review): Python 2 only -- `reload` is not a builtin in Python 3.
# This switches the default codec for the whole process instead of encoding
# at the point where the CSV is written; presumably fine for a one-off
# script, but confirm before reusing it in shared code.
from os import sys
reload(sys)
sys.setdefaultencoding("utf-8")
did the trick
Andre
You could convert the script to Python 3, and then set the write mode when opening the output file to "w" instead, to write Unicode. Not trying to evangelize, but Python 3 makes this sort of thing easier. If you want to stay with Python 2, check out this guide: https://docs.python.org/2/howto/unicode.html
If you want to write a UTF-8 encoded file, you have to use codecs.open. Try this small example:
import codecs

# Part 1: write a unicode string to a file opened in plain binary mode.
# Any exception raised by the write is printed instead of propagated.
with open('/tmp/o1.txt', 'wb') as o1:
    try:
        o1.write(u'\u20ac')
    except Exception as exc:
        print(exc)

# Part 2: the same write through codecs.open, which encodes to UTF-8.
with codecs.open('/tmp/o2.txt', 'w', 'utf-8') as o2:
    o2.write(u'\u20ac')
Why not use the UnicodeWriter class from the examples in the csv docs (https://docs.python.org/2/library/csv.html#examples)? I think it should solve your problem.
If not, I'd propose a different approach to your problem if you have Excel: use win32com, Dispatch Excel, and use the Excel object model. You can use built-in Excel functions to rename or delete columns, etc., and then save it as csv.
E.g.
import win32com.client
# Obtain the 'Excel.Application' COM object through win32com's gencache.
excelInstance = win32com.client.gencache.EnsureDispatch('Excel.Application')
# Open the workbook; `filepath` is not defined in this snippet and must be supplied.
workbook = excelInstance.Workbooks.Open(filepath)
# Select the worksheet by name ('WorksheetName' is presumably a placeholder).
worksheet = workbook.Worksheets('WorksheetName')
#### do what you like
# Example edit: locate the cell containing 'customernumer' within the used
# range and overwrite its value with the new header text.
worksheet.UsedRange.Find('customernumer').Value2 = 'Customer_Number__c'
####
workbook.SaveAs('Filename.csv', 6) #6 means csv in XlFileFormat enumeration