I am following an online tutorial for making a chatbot... here is the code for the beginning portion:
import sqlite3
import json
from datetime import datetime
timeframe = '2015-01'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
    c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")

def format_data(data):
    data = data.replace('\n', ' newlinechar ').replace('\r', ' newlinechar ').replace('"', "'")
    return data

def find_parent(pid):
    try:
        sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False

if __name__ == '__main__':
    create_table()
    row_counter = 0
    paired_rows = 0

    with open('C:/Users/oriba/Desktop/Month of Reddit/RC_2015-01'.format(timeframe.split('-')[0], timeframe), encoding='ISO-8859-1', buffering=1000) as f:
        for row in f:
            print(row)
            row_counter += 1
            row = json.load(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            comment_id = row['name']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
When that runs, I get this error:
Traceback (most recent call last):
  File "C:/Users/oriba/Desktop/Month of Reddit/chatbot.py", line 37, in <module>
    for row in f:
  File "C:\Users\oriba\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 97: character maps to <undefined>
After searching online, I found that adding encoding='ISO-8859-1' to open() should fix it... then I get this error:
BZh91AY&SYÔAÙÕÿî»ÿÿÿúÿÿÿÿÿÿÿÿc*è` 1Ï. ñÕ ¢U±Ã$'¤;\=# ÝX9kl´ÜιKW; É# Ò PQáGF PÝ Û P :è
Traceback (most recent call last):
  File "C:/Users/oriba/Desktop/Month of Reddit/chatbot.py", line 40, in <module>
    row = json.load(row)
  File "C:\Users\oriba\AppData\Local\Programs\Python\Python36\lib\json\__init__.py", line 296, in load
    return loads(fp.read(),
AttributeError: 'str' object has no attribute 'read'
And now I'm at a loss. I know this is a lot, this is complex for me. I appreciate any help :)
json.loads() does the job.
loads() reads from a str object, while load() reads from a file object.
Your code is

for row in f:
    ...

row here is a str.
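Applied to your loop, the minimal fix is to parse each line with loads() and leave the rest as is:

for row in f:
    print(row)
    row_counter += 1
    row = json.loads(row)  # parse the JSON text of this line directly
    parent_id = row['parent_id']
    ...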
My file path is
C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/
There are many folders in that directory. I need to look through those directories and open the files that start with 'RC_'.
Here's my code:
import sqlite3
import json
import os
from datetime import datetime
timeframe = '2015-05'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
    c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")

def format_data(data):
    data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
    return data

def find_parent(pid):
    try:
        sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False

if __name__ == '__main__':
    create_table()
    row_counter = 0
    paired_rows = 0

    with open('C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe), buffering=1000) as f:
        for row in f:
            row_counter += 1
            row = json.loads(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            comment_id = row['name']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
            # maybe check for a child, if child, is our new score superior? If so, replace. If not...
            if score >= 2:
                existing_comment_score = find_existing_score(parent_id)
But it seems there is some mistake in the path. I get an error:

Traceback (most recent call last):
  File "C:/Users/Ratul/AppData/Local/Programs/Python/Python37/test02.py", line 36, in <module>
    with open('C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe), buffering=1000) as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/2015/RC_2015-05'

I'm not sure what I did wrong there. Please help.
Use How to debug small programs and run

print('C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(
    timeframe.split('-')[0], timeframe))

instead of open(). Check whether everything the path points to exists, because for some of your values it does not. Hence the error.
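Since you said you need to look through several folders and open every file that starts with 'RC_', here is a minimal sketch of listing what actually exists before opening anything (assuming Python 3.5+ for recursive globbing; the base path is yours):

import glob
import os

base = 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data'
# recursively collect every file whose name starts with 'RC_'
rc_files = glob.glob(os.path.join(base, '**', 'RC_*'), recursive=True)
for path in rc_files:
    print(path)  # confirm the dump you expect is really there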
If most of your files exist, it is far easier to handle the error itself:
myname = 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe)
try:
    with open(myname, buffering=1000) as f:
        for row in f:
            row_counter += 1
            row = json.loads(row)
            parent_id = row['parent_id']
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            comment_id = row['name']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
            # maybe check for a child, if child, is our new score superior? If so, replace. If not...
            if score >= 2:
                existing_comment_score = find_existing_score(parent_id)
except FileNotFoundError as fnfError:
    print(myname)
    print(fnfError)
The open() command does not care whether you use \ or /. If you use \, you should escape it or use a raw string (e.g. r'C:\some\dir\file.txt'). Your syntax is OK as is: open() will use the appropriate directory delimiters under Windows even if you give it 'c:/somedir/file.txt'.
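For illustration, with a hypothetical file, all three spellings point at the same place on Windows:

open('C:/somedir/file.txt')      # forward slashes
open('C:\\somedir\\file.txt')    # escaped backslashes
open(r'C:\somedir\file.txt')     # raw string, backslashes taken literally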
Read up: About error handling
I've got a class for uploading my CSV files with holidays to my fullcalendar. It looks like this:
class UploadVacationsView(APIView):

    def put(self, request, *args, **kwargs):
        try:
            # check file type
            mime = MimeTypes()
            url = urllib.pathname2url(request.FILES['file']._name)
            mime_type = mime.guess_type(url)
            if 'text/csv' not in mime_type:
                raise APIException(code=400, detail='File type must be CSV')

            vacations_list = []
            csv_file = StringIO(request.data.get('file', None).read().decode('utf-8'))
            user_tz = pytz.timezone(request.user.common_settings.time_zone)
            schedule_file = ScheduleFile.objects.create(user=request.user)

            instance_hebcal = HebcalService()
            events = instance_hebcal.process_csv(csv_file, user_tz)
            ...
And in the other class, I've got a method that works with csv files:
class HebcalService(...):

    def process_csv(self, csv_file, user_tz):
        events = []
        csv_input = csv.reader(csv_file.readlines(), dialect=csv.excel)
        curr_row = 1
        start_date = None
        end_date = None
        start_name = None
        holiday_name = ''
        last_event = {'subject': '',
                      'date': '',
                      }
        for row in list(csv_input)[1:]:
            subject, date, time, _, _, _, _ = row[:7]
            curr_row += 1
            row = [unicode(cell.strip(), 'utf-8') for cell in row]
            if 'lighting' in subject and not start_date:
                start_date = user_tz.localize(format_datetime(date, time))
                if date == last_event['date']:
                    start_name = last_event['subject']
Everything is OK when working with English holiday names, but when I encounter Hebrew names it throws an error:
Traceback (most recent call last):
  File "/home/stas/work/vacation/vmode/apps/marketplaces/base/api/views.py", line 47, in put
    events = instance_hebcal.process_csv(csv_file, user_tz)
  File "/home/stas/work/vacation/vmode/apps/marketplaces/base/services/hebcal.py", line 106, in process_csv
    for row in list(csv_input)[1:]:
UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-23: ordinal not in range(128)
I've read about converting all strings to unicode, but I don't understand where that default ASCII encoding comes from. How can I handle it and save the holiday_name string from the csv file?
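One common workaround, taken from the Python 2 csv documentation rather than from this code: the py2 csv module cannot handle unicode input, so encode each line to UTF-8 bytes before the reader sees it, and decode the cells back afterwards. A minimal sketch (names are illustrative):

import csv

def unicode_csv_rows(unicode_lines):
    # encode unicode lines to UTF-8 byte strings for Python 2's csv module...
    reader = csv.reader((line.encode('utf-8') for line in unicode_lines),
                        dialect=csv.excel)
    for row in reader:
        # ...then decode every cell back to unicode after parsing
        yield [cell.decode('utf-8') for cell in row]

Inside process_csv this would replace the csv.reader(csv_file.readlines(), ...) call, and the later unicode(cell.strip(), 'utf-8') conversion becomes unnecessary.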
I am trying to save the query results from PostgreSQL into a CSV file, but the CSV file sometimes lacks the headers even though it contains all the rows returned by the query.
import psycopg2
import csv
try:
    conn = psycopg2.connect(database = '', user = '', host = '', password = '')
except:
    print ("I am unable to connect to the database")

cursor = conn.cursor()
query = """select * from"""
cursor.execute(query)
result = cursor.fetchall()

with open("kiker.csv","wb") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = ["Builder", "Subdivision", "Spec", "Build", "Cancel", "Price", "Sq_Ft", "PPSF", "Realtor", "Project ID"], extrasaction = 'ignore')
    writer.writeheader()
    writer.writerow(result)

print "Query 1 Created"
Error:
Traceback (most recent call last):
  File "C:\connecting.py", line 45, in <module>
    writer.writerow(result)
  File "C:\Python27\lib\csv.py", line 152, in writerow
    return self.writer.writerow(self._dict_to_list(rowdict))
  File "C:\Python27\lib\csv.py", line 149, in _dict_to_list
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
AttributeError: 'list' object has no attribute 'get'
I tried both of the methods below, but both of them fail to include the header information:
c = csv.writer(open("kiker.csv","wb"))
for row in result:
    c.writerow(row)
and
fp = open("kiker.csv","wb")
myFile = csv.writer(fp)
myFile.writerows(result)
fp.close()
How can I fix this?
I used Pandas to get around the situation. Worked like a treat.
cursor.execute(query)
result = cursor.fetchall()
first = pd.DataFrame(result, columns = ["Builder","Subdivision","Spec","Build","Cancel","Price","Sq_Ft","PPSF","Realtor","Project ID"])
first.to_csv("kiker.csv",index = False)
DictWriter expects dicts, not tuples: https://docs.python.org/3.6/library/csv.html#writer-objects
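For completeness, a minimal sketch of giving DictWriter what it expects, assuming the query columns come back in the same order as the fieldnames:

fieldnames = ["Builder", "Subdivision", "Spec", "Build", "Cancel",
              "Price", "Sq_Ft", "PPSF", "Realtor", "Project ID"]
with open("kiker.csv", "wb") as csvfile:   # "wb" to match the Python 2 code above
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    for row in result:
        writer.writerow(dict(zip(fieldnames, row)))  # turn each tuple into a dict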
I am trying to fetch an HTML link in the code and take its source code as a list of strings. As I have to get some relevant data from it, I am decoding everything to UTF-8.
I am also using beautifulsoup4, which extracts the text in decoded form.
This is the code that I have used:
def do_underline(line,mistakes):
    last = u'</u></font>'
    first = u"<u><font color='red'>"
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
    lenm = len(mistakes)
    for i in range(lenm):
        a.insert(mistakes[lenm-i-1][2],last)
        a.insert(mistakes[lenm-i-1][1],first)
    b = u''
    return b.join(a)

def readURL(u):
    """
    URL -> List
    Opens a webpage's source code and extracts its text,
    along with blank and new lines.
    Enumerates all lines (including blank and new lines).
    """
    global line_dict,q
    line_dict = {}
    p = opener.open(u)
    p1 = p.readlines()
    q = [i.decode(encoding = 'UTF-8',errors='ignore') for i in p1]
    q1 = [BeautifulSoup(i).get_text() for i in q]
    q2 = list(enumerate(q1))
    line_dict = {i:j for (i,j) in enumerate(q)}
    return q2

def process_file(f):
    """
    (.html file) -> List of Spelling Mistakes
    """
    global line_dict
    re = readURL(f)
    de = del_blankempty(re)
    fd = form_dict(de)
    fflist = []
    chklst = []
    for i in fd:
        chklst = chklst + list_braces(i,line_dict)
        fflist = fflist + find_index_mistakes(i,fd)
    final_list = list(set(is_inside_braces_or_not(chklst,fflist)))
    final_dict = {i:sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])),key=lambda student: student[1]) for i in fd}
    for i in line_dict:
        if i in fd:
            line_dict[i] = do_underline(line_dict[i],final_dict[i])
        else:
            line_dict[i] = line_dict[i]
    create_html_file(line_dict)
    print "Your Task is completed"

def create_html_file(a):
    import io
    fl = io.open('Spellcheck1.html','w', encoding='UTF-8')
    for i in a:
        fl.write(a[i])
    print "Your HTML text file is created"
I am getting the following error every time I run the script:
Traceback (most recent call last):
  File "checker.py", line 258, in <module>
    process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
  File "checker.py", line 243, in process_file
    line_dict[i] = do_underline(line_dict[i],final_dict[i])
  File "checker.py", line 89, in do_underline
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
  File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
    return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)
Any suggestions on how I can remove this error?
If there is a way to decode everything coming from the given link into UTF-8, I think it will solve the problem.
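What the traceback suggests is that line is already unicode by the time do_underline() runs: in Python 2, calling .decode() on a unicode object first encodes it with the default 'ascii' codec, and that implicit step is what raises the UnicodeEncodeError. A minimal guard, illustrative rather than from the original script:

def to_unicode(s):
    # only byte strings need decoding; unicode passes through untouched,
    # avoiding Python 2's implicit ascii-encode inside unicode.decode()
    if isinstance(s, str):
        return s.decode('utf-8', errors='ignore')
    return s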
I'm trying to parse a MARC file downloaded from the Library of Congress. I've successfully downloaded the record using PyZ3950, but when I try to parse the file using PyMarc, I get the following error:
Traceback (most recent call last):
  File "test.py", line 13, in <module>
    for record in reader:
  File "build/bdist.macosx-10.9-intel/egg/pymarc/reader.py", line 83, in next
ValueError: invalid literal for int() with base 10: '<PyZ3'
And here is my full code:
from PyZ3950 import zoom, zmarc
from pymarc import MARCReader
conn = zoom.Connection('z3950.loc.gov', 7090)
conn.databaseName = 'VOYAGER'
conn.preferredRecordSyntax = 'USMARC'
query = zoom.Query('CCL', 'ti="1066 and all that"')
res = conn.search(query)
reader = MARCReader(str(res))
for record in reader:
    print record.title()

conn.close()
Your statement:

res = conn.search(query)

returns a ResultSet, according to http://www.panix.com/~asl2/software/PyZ3950/zoom.html
Each record r in the ResultSet has its data in r.data.
So you have to feed the MARCReader with each r.data, or with them all concatenated.
This will work:
from PyZ3950 import zoom, zmarc
from pymarc import MARCReader
conn = zoom.Connection('z3950.loc.gov', 7090)
conn.databaseName = 'VOYAGER'
conn.preferredRecordSyntax = 'USMARC'
query = zoom.Query('CCL', 'ti="1066 and all that"')
res = conn.search(query)
marc = ''
for r in res:
    marc = marc + r.data

reader = MARCReader(marc)
for record in reader:
    print record.title()

conn.close()