I've written a web scraper that scrapes NBA box score data from basketball-reference. It fails on one specific webpage with this error:
UnicodeEncodeError: 'charmap' codec can't encode character '\u0107' in position 11: character maps to <undefined>
The webpage on which the error occurs is here. Lastly, the specific player data that trips it up and throws this UnicodeEncodeError is this one (although I am sure the error is more general and will be produced by any character that carries an uncommon accent mark).
The minimal reproducible code:
def get_boxscore_basic_table(tag):
    """Return True when *tag* looks like a basic box-score table wrapper.

    Used as a filter callable (e.g. for ``soup.find_all``) so that only
    specific tables are selected. A tag matches when its ``id`` contains
    ``"basic"`` and its ``class`` contains ``"section_wrapper"`` but does
    not contain ``"toggleable"``.

    Args:
        tag: Any object exposing ``get(name)`` that returns the attribute
            value, or ``None`` when the attribute is absent.

    Returns:
        bool: ``True`` for a matching tag, ``False`` otherwise (including
        tags that have no ``id`` or no ``class``).
    """
    tag_id = tag.get("id")
    tag_class = tag.get("class")
    # A tag missing either attribute can never match; return a real bool
    # rather than leaking None or an empty value to the caller.
    if not tag_id or not tag_class:
        return False
    return (
        "basic" in tag_id
        and "section_wrapper" in tag_class
        and "toggleable" not in tag_class
    )
import requests
from bs4 import BeautifulSoup
import lxml
import csv
import re
# Fetch one box-score page, parse it, and dump every player row to boxscore.csv.
# NOTE(review): the indentation of this snippet was lost when it was pasted; the
# nesting is presumably `for table` > `for row` > `if` -- confirm before running.
website = 'https://www.basketball-reference.com/boxscores/202003110MIA.html'
r = requests.get(website).text
soup = BeautifulSoup(r, 'lxml')
# Keep only the tags accepted by the get_boxscore_basic_table predicate.
tables = soup.find_all(get_boxscore_basic_table)
# No encoding= is passed, so open() uses the platform default codec; presumably
# this is where the 'charmap' UnicodeEncodeError quoted in the question comes from.
in_file = open('boxscore.csv', 'w', newline='')
csv_writer = csv.writer(in_file)
column_names = ['Player','Name','MP','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','+/-']
# Header row first.
csv_writer.writerow(column_names)
for table in tables:
rows = table.select('tbody tr')
for row in rows:
building_player = [] #temporary container to hold player and stats
# The text of the row's <th> cell is used as the player name.
player_name = row.th.text
# Rows whose header text contains 'Reserves' are skipped.
if 'Reserves' not in player_name:
building_player.append(player_name)
# Every td with class "right" contributes one value after the name.
stats = row.select('td.right')
for stat in stats:
building_player.append(stat.text)
csv_writer.writerow(building_player) #writing to csv
in_file.close()
What is the best way around this?
I've seen some stuff online about changing the encoding, specifically calling the .encode('utf-8') method on the string before writing it to the csv. Although this .encode() method stops the error from being thrown, it has several problems of its own. For instance, calling player_name.encode('utf-8') before writing to the csv turns the name 'Willy Hernangómez' into 'b'Willy Hernang\xc3\xb3mez'' within my csv... not exactly a step in the right direction.
Any help with this and an explanation as to what is happening would be much appreciated!
Use
# encoding='utf-8' makes the text layer encode every character as UTF-8 instead of using the platform default codec
in_file = open('boxscore.csv', 'w', newline='', encoding='utf-8')
instead of
# original call: no encoding argument, so the platform default codec is used
in_file = open('boxscore.csv', 'w', newline='')
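and keep everything else the same. When you open the CSV in Excel, make sure it is read as UTF-8 (alternatively, writing with encoding='utf-8-sig' adds a byte-order mark that lets Excel detect UTF-8 automatically).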
I am building a recommendation engine. This JSON file contains event data, and I want to convert it into a dataframe. I tried the read_json method, but it gives an error:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 21573281: character maps to <undefined>
Below are some entries from the JSON file:
{"_id":{"$oid":"57a30ce268fd0809ec4d194f"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"OfferViewed","event_timestamp":{"$numberLong":"1470183505399"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"Category":"120000","CustomerID":"4078","OfferID":"45436"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1950"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"ContextMenuItemSelected","event_timestamp":{"$numberLong":"1470183500206"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"MenuItem":"OfferList","CustomerID":"4078"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1951"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"CategoryPageCategorySelection","event_timestamp":{"$numberLong":"1470183499171"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"Category":"Recharge","CustomerID":"4078"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1952"},"session":{"start_timestamp":{"$numberLong":"1470183490481"},"session_id":"def5faa9-20160803-001810481"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470183490481"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{"CustomerID":"4078"}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1953"},"session":{"start_timestamp":{"$numberLong":"1470181311752"},"session_id":"def5faa9-20160802-234151752","stop_timestamp":{"$numberLong":"1470181484875"}},"metrics":{},"arrival_timestamp":{"$numberLong":"1470183523054"},"event_type":"_session.stop","event_timestamp":{"$numberLong":"1470183490480"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"5","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"2.0.0.0","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:2e26918b-f7b1-471e-9df4-b931509f7d37","client_id":"ee0b61b0-85cf-4b2f-960e-e2aedef5faa9"},"device":{"locale":{"country":"US","code":"en_US","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"YU","model":"AO5510"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1954"},"session":{"start_timestamp":{"$numberLong":"1470193238841"},"session_id":"7b606a93-20160803-030038841"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193295093"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470193238844"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1955"},"session":{"start_timestamp":{"$numberLong":"1470193253960"},"session_id":"7b606a93-20160803-030053960","stop_timestamp":{"$numberLong":"1470193256359"}},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.stop","event_timestamp":{"$numberLong":"1470193278227"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1956"},"session":{"start_timestamp":{"$numberLong":"1470193253960"},"session_id":"7b606a93-20160803-030053960"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470193253960"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1957"},"session":{"start_timestamp":{"$numberLong":"1470193238841"},"session_id":"7b606a93-20160803-030038841","stop_timestamp":{"$numberLong":"1470193244581"}},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.stop","event_timestamp":{"$numberLong":"1470193253959"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
{"_id":{"$oid":"57a30ce268fd0809ec4d1958"},"session":{"start_timestamp":{"$numberLong":"1470193331290"},"session_id":"7b606a93-20160803-030211290"},"metrics":{},"arrival_timestamp":{"$numberLong":"1470193404776"},"event_type":"_session.start","event_timestamp":{"$numberLong":"1470193331291"},"event_version":"3.0","application":{"package_name":"com.think.vito","title":"Vito","version_code":"2","app_id":"7ffa58dab3c646cea642e961ff8a8070","cognito_identity_pool_id":"us-east-1:4d9cf803-0487-44ec-be27-1e160d15df74","version_name":"1.0.2","sdk":{"version":"2.2.2","name":"aws-sdk-android"}},"client":{"cognito_id":"us-east-1:e96515c9-5824-4c66-a42f-33cceb78b6e3","client_id":"efed74fd-40d8-41a2-b37e-e85c7b606a93"},"device":{"locale":{"country":"GB","code":"en_GB","language":"en"},"platform":{"version":"5.1.1","name":"ANDROID"},"make":"samsung","model":"SM-J200G"},"attributes":{}}
Wrong encoding. Explicitly read it as UTF-8, e.g. (edit: this also handles 'dirty' line feeds, i.e. LF a.k.a. \n):
import io

# Decode explicitly as UTF-8 instead of relying on the platform default codec.
with open(datafilename, encoding="utf8") as f:
    # Read the file line by line, dropping trailing whitespace and empty lines
    records = [line.rstrip() for line in f if line.strip()]
# The sample entries hold one JSON object per line, so keep one record per line;
# joining with '' would glue the objects together into invalid JSON.
data = "\n".join(records)
# Load the dataframe from the cleaned text (the original passed the undefined
# name `datafile`); lines=True parses each line as a separate record.
df = pandas.read_json(io.StringIO(data), lines=True)
You could try using:
import json
# Open with an explicit encoding: without it, open() decodes with the platform
# default codec, which is the situation the 'charmap' UnicodeDecodeError describes.
with open('myfile.json', encoding='utf-8') as json_data:
    # json.load expects the file to contain a single JSON document.
    # NOTE(review): if the file holds one JSON object per line, parse it line by
    # line with json.loads instead -- confirm against the actual file.
    d = json.load(json_data)
    print(d)
Without more info it's difficult to advise.
As the error says, you have an issue with the encoding. When you read in the file, you need to change the encoding:
# Pass encoding explicitly so the file is decoded as UTF-8 rather than with the platform default codec
file = open(filename, encoding="utf8")