My current professor is using Python 2.7 for examples in class, but other professors I will be taking classes from in the future have suggested I use Python 3.5. I am trying to convert my current professor's examples from 2.7 to 3.5. Right now I'm having an issue with the urllib2 package, which I understand has been split up in Python 3.
The original code in the IPython notebook looks like this:
import csv
import urllib2

data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

response = urllib2.urlopen(data_url)
myreader = csv.reader(response)
for i in range(5):
    row = next(myreader)
    print ','.join(row)
Which I have converted to:
import csv
import urllib.request

data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

response = urllib.request.urlopen(data_url)
myreader = csv.reader(response)
for i in range(5):
    row = next(myreader)
    print(','.join(row))
But that leaves me with the error:
Error Traceback (most recent call last)
<ipython-input-19-20da479e256f> in <module>()
7 myreader = csv.reader(response)
8 for i in range(5):
----> 9 row = next(myreader)
10 print(','.join(row))
Error: iterator should return strings, not bytes (did you open the file in text mode?)
I'm unsure how to proceed from here. Any ideas?
Wrap response with another iterator that decodes the bytes to strings and yields the strings:
import csv
import urllib.request

def decode_iter(it):
    # iterate line by line
    for line in it:
        # convert bytes to string using `bytes.decode`
        yield line.decode()

data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
myreader = csv.reader(decode_iter(response))
for i in range(5):
    row = next(myreader)
    print(','.join(row))
UPDATE
Instead of decode_iter, you can use codecs.iterdecode:
import csv
import codecs
import urllib.request

data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
myreader = csv.reader(codecs.iterdecode(response, 'utf-8'))
for i in range(5):
    row = next(myreader)
    print(','.join(row))
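As a further alternative (my addition, not from the original answer): the object returned by urlopen is a binary file-like object, so you can also wrap it in io.TextIOWrapper to get a text stream:

import csv
import io
import urllib.request

data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
# wrap the binary response in a text stream so csv.reader receives str lines
myreader = csv.reader(io.TextIOWrapper(response, encoding='utf-8'))
for i in range(5):
    row = next(myreader)
    print(','.join(row))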
Related
I'll preface by saying I'm a novice with Python, but I'm trying to encode a single column from a CSV to Base64 and write to another CSV. The file has 3 columns (consumer_id, sms_number, email_address) and I only want to encode the 'consumer_id'. Here is what I have as of now:
import base64

with open('File1.csv') as csvfile:
    with open('File2.csv', 'w') as newfile:
        reader = csv.DictReader(csvfile)
        for i, r in enumerate(reader):
            # writing csv headers
            if i == 0:
                newfile.write(','.join(r) + '\n')
            # convert 'ID' column to Base64
            r['consumer_id'] = base64.b64decode(parse.unquote(row['consumer_id']))
            # writing the new row to the file
            newfile.write(','.join(r.values()) + '\n')
The error I get is
Traceback (most recent call last):
File "c:\script.py", line 93, in <module>
r['consumer_id'] = base64.b64decode(parse.unquote(row['consumer_id']))
NameError: name 'parse' is not defined. Did you mean: 'vars'?
There are a few errors:
you did not import parse from urllib, which is the reason for the error message you got: -> from urllib import parse
you want to encode, not decode: -> base64.b64encode
you're also missing the import csv
row is not defined: -> change r to row
Full code:
import base64
import csv
from urllib import parse

with open('C:/temp/File1.csv') as csvfile:
    with open('C:/temp/File2.csv', 'w') as newfile:
        reader = csv.DictReader(csvfile)
        for i, row in enumerate(reader):
            # writing csv headers
            if i == 0:
                newfile.write(','.join(row) + '\n')
            # convert 'ID' column to Base64
            row['consumer_id'] = base64.b64encode(parse.unquote(row['consumer_id']).encode()).decode()
            # writing the new row to the file
            newfile.write(','.join(row.values()) + '\n')
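A side note (my suggestion, not part of the original answer): joining the values with commas by hand will break if any field itself contains a comma. Using csv.DictWriter for the output is more robust; a minimal sketch, assuming the same three columns:

import base64
import csv
from urllib import parse

with open('C:/temp/File1.csv') as csvfile, open('C:/temp/File2.csv', 'w', newline='') as newfile:
    reader = csv.DictReader(csvfile)
    writer = csv.DictWriter(newfile, fieldnames=reader.fieldnames)
    writer.writeheader()
    for row in reader:
        # encode the 'consumer_id' column and keep the result as a str
        row['consumer_id'] = base64.b64encode(parse.unquote(row['consumer_id']).encode()).decode()
        writer.writerow(row)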
What is the proper way to read a text file from the internet?
For example, the text file here: https://gist.githubusercontent.com/deekayen/4148741/raw/01c6252ccc5b5fb307c1bb899c95989a8a284616/1-1000.txt
The code below works, but produces an extra b' in front of each word:
from urllib.request import urlopen

#url = 'https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt'
url = 'https://gist.githubusercontent.com/deekayen/4148741/raw/01c6252ccc5b5fb307c1bb899c95989a8a284616/1-1000.txt'

# it's a file-like object and works just like a file
l = set()
data = urlopen(url)
for line in data:  # files are iterable
    word = line.strip()
    print(word)
    l.add(word)
print(l)
You have to decode each bytes object to a string. For that you can use the decode('utf-8') method. Here's the code:
from urllib.request import urlopen

url = 'https://gist.githubusercontent.com/deekayen/4148741/raw/01c6252ccc5b5fb307c1bb899c95989a8a284616/1-1000.txt'
l = set()
data = urlopen(url)
for line in data:  # files are iterable
    word = line.strip().decode('utf-8')  # decode the line into unicode
    print(word)
    l.add(word)
print(l)
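If you don't need to process the file line by line, you can also read the whole body at once and decode it in one go (a standard-library variation of my own, not from the original answer):

from urllib.request import urlopen

url = 'https://gist.githubusercontent.com/deekayen/4148741/raw/01c6252ccc5b5fb307c1bb899c95989a8a284616/1-1000.txt'
# read the entire response, decode it once, then split into lines
text = urlopen(url).read().decode('utf-8')
words = set(text.splitlines())
print(words)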
It's simple using pandas. Just execute
import pandas as pd
pd.read_csv('https://gist.githubusercontent.com/deekayen/4148741/raw/01c6252ccc5b5fb307c1bb899c95989a8a284616/1-1000.txt')
and you are all set :)
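One caveat (my note, not part of the original answer): read_csv treats the first line as a header by default, so the first word would be consumed as a column name. Passing header=None keeps all 1000 rows:

import pandas as pd

# header=None stops pandas from treating the first word as a column header
words = pd.read_csv(
    'https://gist.githubusercontent.com/deekayen/4148741/raw/01c6252ccc5b5fb307c1bb899c95989a8a284616/1-1000.txt',
    header=None,
)
print(words.head())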
If I have a json file with multiple json objects, such as the following:
{"created_at":"Sun Apr 16 02:00:14 +0000 2017","id":853427785339084800,"id_str":"853427785339084800"}
{"created_at":"Sun Apr 16 02:03:24 +0000 2017","id":853428582613475332,"id_str":"853428582613475332"}
I want to loop over all the json objects and add a new element, with its value taken from user input on the command line, one object at a time. Is this possible given that my file is not a json array?
import json
import pandas as pd
from pprint import pprint

tweets_data_path = 'data.json'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue
pprint(tweets_data)
Your code runs fine, so I am assuming the question is how to add user input as a new key-value pair to each json object you load from the file. For Python 2.7 use raw_input; in Python 3, raw_input was renamed to input.
import json
import pandas as pd
from pprint import pprint

tweets_data_path = 'data.json'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweet["new_key"] = raw_input('provide input: ')
        tweets_data.append(tweet)
    except:
        continue
pprint(tweets_data)
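If you also want to persist the updated objects, you can write them back out one JSON object per line, matching the input format (a sketch of my own, assuming an output file named data_updated.json):

import json

# write each updated tweet back out, one JSON object per line
with open('data_updated.json', 'w') as out_file:
    for tweet in tweets_data:
        out_file.write(json.dumps(tweet) + '\n')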
I'm building a simple scraper in order to learn Python.
After writing the csvWriter function below, I'm having issues. It seems that the encoded text can't be written to the csv file (I assume this is because of the price information I'm scraping).
Also, I'm wondering if I'm correct in thinking that, in this case, it's best to go from set to list to get the information zipped and presented the way I want before writing.
Also - any general advice on how I'm approaching this?
from bs4 import BeautifulSoup
import requests
import time
import csv

response = requests.get('http://website.com/subdomain/logqueryhere')
baseurl = 'http://website.com'

soup = BeautifulSoup(response.text)
hotelInfo = soup.find_all("div", {'class': "hotel-wrap"})

#retrieveLinks: A function to generate a list of hotel URL's to be passed to the price checker.
def retrieveLinks():
    for hotel in hotelInfo:
        urllist = []
        hotelLink = hotel.find('a', attrs={'class': ''})
        urllist.append(hotelLink['href'])
        scraper(urllist)

hotelnameset = set()
hotelurlset = set()
hotelpriceset = set()

# Scraper: A function to scrape from the lists generated above with retrieveLinks
def scraper(inputlist):
    global hotelnameset
    global hotelurlset
    global hotelpriceset
    #Use a set here to avoid any dupes.
    for url in inputlist:
        fullurl = baseurl + url
        hotelurlset.add(str(fullurl))
        hotelresponse = requests.get(fullurl)
        hotelsoup = BeautifulSoup(hotelresponse.text)
        hoteltitle = hotelsoup.find('div', attrs={'class': 'vcard'})
        hotelhighprice = hotelsoup.find('div', attrs={'class': 'pricing'}).text
        hotelpriceset.add(hotelhighprice)
        for H1 in hoteltitle:
            hotelName = hoteltitle.find('h1').text
            hotelnameset.add(str(hotelName))
        time.sleep(2)
    csvWriter()

#csvWriter: A function to write the above mentioned sets/lists to a CSV file.
def csvWriter():
    global hotelnameset
    global hotelurlset
    global hotelpriceset
    csvname = list(hotelnameset)
    csvurl = list(hotelurlset)
    csvprice = list(hotelpriceset)
    #lets zip the values we needed (until we learn a better way to do it)
    zipped = zip(csvname, csvurl, csvprice)
    c = csv.writer(open("hoteldata.csv", 'wb'))
    for row in zipped:
        c.writerow(row)

retrieveLinks()
The error is as follows:
± |Add_CSV_Writer U:2 ✗| → python main.py
Traceback (most recent call last):
File "main.py", line 62, in <module>
retrieveLinks()
File "main.py", line 18, in retrieveLinks
scraper(urllist)
File "main.py", line 44, in scraper
csvWriter()
File "main.py", line 60, in csvWriter
c.writerow(row)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u20ac' in position 0: ordinal not in range(128)
Posting your actual error really helps! In any case: in Python 2.x the csv writer does not automatically encode unicode for you. You essentially have to write your own encoding layer, use the unicodecsv package (https://pypi.python.org/pypi/unicodecsv/0.9.0), or adapt one of the unicode CSV implementations on the web:
import unicodecsv

def csvWriter():
    global hotelnameset
    global hotelurlset
    global hotelpriceset
    csvname = list(hotelnameset)
    csvurl = list(hotelurlset)
    csvprice = list(hotelpriceset)
    #lets zip the values we needed (until we learn a better way to do it)
    zipped = zip(csvname, csvurl, csvprice)
    with open('hoteldata.csv', 'wb') as f_in:
        c = unicodecsv.writer(f_in, encoding='utf-8')
        for row in zipped:
            c.writerow(row)
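For what it's worth, on Python 3 the standard csv module handles unicode natively and no extra package is needed; a minimal sketch under that assumption, reusing the zipped rows from above:

import csv

# on Python 3, open in text mode with an explicit encoding; newline='' avoids blank rows on Windows
with open('hoteldata.csv', 'w', newline='', encoding='utf-8') as f_out:
    c = csv.writer(f_out)
    for row in zipped:
        c.writerow(row)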
I have a question about parsing HTML tags with Python.
My code looks like:
#!/usr/bin/python
# -*- coding: utf-8 -*-

from lxml import html
import requests
import urllib2
import sys
import re
import time
import urllib
import datetime

def get_web():
    try:
        input_sat = open('rtc.xml', 'w')
        godina = datetime.date.today().strftime("%Y")
        print godina
        mjesec = datetime.date.today().strftime("%m")
        print mjesec

        for x in range(32):
            if x < 1:
                x = x + 1
            var = x
            url = 'http://www.rts.rs/page/tv/sr/broadcast/20/RTS+1.html?month={}&year={}&day={}&type=0'.format(mjesec, godina, var)
            page = requests.get(url)
            tree = html.fromstring(page.text)

            a = tree.xpath('//div[@id="center"]/h1/text()')  # date
            b = tree.xpath('//div[@class="ProgramTime"]/text()')  # time
            c = tree.xpath('//div[@class="ProgramName"]/text()')
            e = tree.xpath('//div[@class="ProgramName"]/a[@class="recnik"]/text()')

            for line in zip(a, b, c, e):
                var = line[0]
                print >> input_sat, line + '\n'
    except:
        pass

get_web()
The script works fine and gets the tags from the URL, but how can I write them into a file for processing?
When I run my code with the for loop, nothing is written to the file. I don't know where the problem is.
I rewrote my code, and it still won't output what's on the page to a file.
As I understand it, your print statement is the problem. You have to use the file handle's write() method instead, and also encode the text as UTF-8:
for line in zip(a, b, c, e):
    var = line[0]
    input_sat.write(line[0].encode('utf-8') + '\n')
It yields:
Programska šema - sreda, 01. jan 2014
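Alternatively (my suggestion, not part of the original answer): if you open the file with io.open and an explicit encoding, the file object accepts unicode directly and the manual encode can be skipped. lxml's xpath text results are already unicode strings, so this should work on Python 2 as well:

import io

# io.open encodes on write, on both Python 2 and 3
input_sat = io.open('rtc.xml', 'w', encoding='utf-8')
for line in zip(a, b, c, e):
    input_sat.write(line[0] + u'\n')
input_sat.close()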