I am trying to build each URL from keywords stored in another file and then read the contents of the URL, but it throws an error:
import pandas as pd

f = open('myfile.txt')
for line in iter(f):
    id = line.strip('\n')
    url_sell = 'https://example.com/getmarketsummary?market=' + str(id)
    df = pd.read_json(url_sell, orient='columns')
Below is the error
urllib2.URLError: <urlopen error no host given>
Use try/except to debug: wrap the call that actually performs the request, so you can see which id fails.
for line in iter(f):
    id = line.strip('\n')
    try:
        url_sell = 'https://example.com/getmarketsummary?market=' + str(id)
        df = pd.read_json(url_sell, orient='columns')
    except Exception:
        print('failed for id: %r' % id)
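For what it's worth, "no host given" often traces back to stray whitespace in the value being concatenated into the URL (the last answer on this page found exactly that). A minimal sketch, assuming Python 3 and one market id per line in myfile.txt:
import urllib.parse
import pandas as pd

with open('myfile.txt') as f:
    for line in f:
        id = line.strip()  # removes \r and spaces too, not just \n
        if not id:
            continue       # skip blank lines
        url_sell = ('https://example.com/getmarketsummary?market='
                    + urllib.parse.quote(id))  # percent-encode the id
        df = pd.read_json(url_sell, orient='columns')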
I'm trying to download book links read from a separate file, but when I execute it, the first downloads run fine and then it crashes:
downloading: http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu
Traceback (most recent call last):
File "C:/Users/he/Desktop/New folder (4)/6.py", line 23, in <module>
exec(l)
File "<string>", line 1, in <module>
File "C:/Users/he/Desktop/New folder (4)/6.py", line 9, in download_url
with open(file_name, 'wb') as f:
OSError: [Errno 22] Invalid argument: '%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu'
Here is the code:
import requests
from bs4 import BeautifulSoup

def download_url(url):
    print("downloading: ", url)
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            for data in r:
                f.write(data)

def n(f):
    s = []
    for l in range(1):
        l = f.readline()
        if l:
            s.append(l)
    return s

b = open('1.txt')
c = n(b)
while len(c) > 0:
    for l in c:
        exec(l)
    c = n(b)
b.close()
File link:
download_url("http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu")
Your link has some characters converted to percent-escapes (to make the URL safe to transmit), but that is not a valid string for a filename, because the operating system may not accept % sequences in filenames.
You can convert it back using urllib.parse.unquote():
import urllib.parse

name = urllib.parse.unquote("%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu")
print(name)
Result:
'Башта Т.М., и др. - Гидравлика, гидромашины и гидроприводы (Учебник для машиностроительных вузов).djvu'
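Folded back into the original function, a minimal sketch (my variant, not the answerer's exact code) that decodes the basename before writing and streams the body in chunks:
import urllib.parse
import requests

def download_url(url):
    # decode the percent-escaped last path segment into a readable filename
    file_name = urllib.parse.unquote(url.rsplit('/', 1)[-1])
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)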
There is a problem with your file name.
You cannot use %D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu as a file name.
I guess you want to use e5772a162f57b7c2b22c7ec8f6883002 as the file name; if not, you can choose another name and revise the code accordingly.
I have modified your code; it successfully downloaded the file from your URL, and I can view it without problems.
import requests

url = 'http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu'

def download_url(url):
    print("downloading: ", url)
    file_name = f"{url.split('/')[5]}.{url.split('.')[-1]}"
    print(url.split('/')[5])
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            f.write(r.content)

download_url(url=url)
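One caveat, my observation rather than the answerer's: the hard-coded index in url.split('/')[5] only works for this exact URL layout. A sketch of a depth-independent alternative that names the file from the decoded last path segment:
import posixpath
from urllib.parse import urlsplit, unquote

def name_from_url(url):
    # last path segment of the URL, with percent-escapes decoded,
    # regardless of how deep the path is
    return unquote(posixpath.basename(urlsplit(url).path))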
I'm following this tutorial from the website: https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed
Everything is good so far but I keep getting an error when trying to run this code.
def buildTrainingSet(corpusFile, tweetDataFile):
    import csv
    import time

    corpus = []
    with open(corpusFile, 'rb') as csvfile:
        lineReader = csv.reader(csvfile, delimiter=',', quotechar="\"")
        for row in lineReader:
            corpus.append({"tweet_id": row[2], "label": row[1], "topic": row[0]})

    rate_limit = 180
    sleep_time = 900/180

    trainingDataSet = []
    for tweet in corpus:
        try:
            status = twitter_api.GetStatus(tweet["tweet_id"])
            print("Tweet fetched" + status.text)
            tweet["text"] = status.text
            trainingDataSet.append(tweet)
            time.sleep(sleep_time)
        except:
            continue

    # now we write them to the empty CSV file
    with open(tweetDataFile, 'wb') as csvfile:
        linewriter = csv.writer(csvfile, delimiter=',', quotechar="\"")
        for tweet in trainingDataSet:
            try:
                linewriter.writerow([tweet["tweet_id"], tweet["text"], tweet["label"], tweet["topic"]])
            except Exception as e:
                print(e)
    return trainingDataSet

#================
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv"
tweetDataFile = "C:\Users\Vilma\Documents\CIS450\group prjt/tweetDataFile.csv"
trainingData = buildTrainingSet(corpusFile, tweetDataFile)
I keep getting this error:
File "<ipython-input-33-54fea359e8f9>", line 1
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv"
^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
I even tried putting r in front of 'C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv' but I still keep getting the error.
Update: I fixed that error by changing the code to
corpusFile = r'C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv'
tweetDataFile = r'C:\Users\Vilma\Documents\CIS450\group prjt\tweetDataFile.csv'
However, a new error pops up:
File "<ipython-input-41-f44768dabc6e>", line 7, in buildTrainingSet
with open(corpusFile,'rb') as csvfile:
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Vilma\\Documents\\CIS450\\group prjt\\corpus.csv'
Try correcting your file path.
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv"
Should be:
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv"
Hope this helps!
You can use:
corpusFile = r"C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv"
If the file is still not found, make sure it actually exists in that folder.
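For reference, a short sketch of the usual ways to write this Windows path without triggering the unicodeescape error (the path is the asker's; pathlib is my own suggestion for checking that the file actually exists):
from pathlib import Path

# three equivalent spellings of the same path
p1 = r"C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv"       # raw string
p2 = "C:\\Users\\Vilma\\Documents\\CIS450\\group prjt\\corpus.csv"  # escaped backslashes
p3 = "C:/Users/Vilma/Documents/CIS450/group prjt/corpus.csv"        # forward slashes also work on Windows

corpusFile = Path(p1)
print(corpusFile.exists())  # confirm the file is really there before opening it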
I'm using a standard try/except syntax for skipping rows in a csv file that aren't streaming properly and therefore can't be downloaded. My code:
for row in list_reader:
    media_id = row['mediaId']
    filename = row['mediaId']
    saveFile = media.get_item(media_id)
    stream_url = saveFile['streams'][0]['streamLocation']
    try:
        r = requests.get(stream_url, allow_redirects=True)
        with open(os.path.join('./media', filename), 'wb') as ofile:
            ofile.write(r.content)
            counter += 1
    except:
        IndexError
        print "error"
However, after a number of files download successfully, the problem row comes up, the error is not handled, and I get:
Traceback (most recent call last):
File "downloadmedia.py", line 28, in <module>
stream_url = saveFile['streams'][0]['streamLocation']
IndexError: list index out of range
I've tried an if/else syntax instead, using the length of the stream_url variable, but this gives the same error. Can someone explain why the error handling doesn't work?
As stated in the comments, your try/except is in the wrong place. The traceback you provided shows that the IndexError occurs at the line stream_url = saveFile['streams'][0]['streamLocation'].
The try/except needs to cover that line to catch the error.
for row in list_reader:
    try:
        media_id = row['mediaId']
        filename = row['mediaId']
        saveFile = media.get_item(media_id)
        stream_url = saveFile['streams'][0]['streamLocation']
        r = requests.get(stream_url, allow_redirects=True)
        with open(os.path.join('./media', filename), 'wb') as ofile:
            ofile.write(r.content)
        counter += 1
    except IndexError:
        print "error"
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)

base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'
gamel = '&t=b&year='
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']

drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content)
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            if players in ant:
                if len(ant) < 60:
                    if d in ant:
                        yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content)
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        tablea = j[52:59]
        tableb = soup.find("b", text='Throws:').next_sibling.strip()
        tablec = soup.find("b", text='Height:').next_sibling.strip()
        tabled = soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            list_of_cells.append(j[len(j)-4:])
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print list_of_rows
        writer.writerows(list_of_rows)
    except (AttributeError, NameError):
        pass
When I run this code to get gamelog batting data, I keep getting an error:
Traceback (most recent call last):
File "battinggamelogs.py", line 44, in <module>
data = requests.get(j)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 65, in get
return request('get', url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
I need a way to bypass this error and keep going. I think the error comes up because there is no table to get data from.
You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being generated.
for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content)
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        pass
This is occurring because the connection itself has a problem, not because there is no data in the table. You aren't even getting that far.
Note: This is completely eating the exception by simply using pass (as you are also doing later in the code block). It may be better to do something like this:
except requests.exceptions.ConnectionError:
    print("Failed to open {}".format(ab))
This will provide you with a message on the console of what URL is failing.
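If the failures are transient (BadStatusLine often is), another option is to retry a few times before giving up. A minimal sketch, my own addition rather than part of the answer:
import time
import requests

def get_with_retries(url, attempts=3, wait=2):
    # retry transient connection errors, re-raising on the final attempt
    for attempt in range(attempts):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            if attempt == attempts - 1:
                raise
            time.sleep(wait)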
So I have the following lines of code in a function
sock = urllib.urlopen(url)
html = sock.read()
sock.close()
and they work fine when I call the function by hand. However, when I call the function in a loop (using the same urls as earlier) I get the following error:
> Traceback (most recent call last):
File "./headlines.py", line 256, in <module>
main(argv[1:])
File "./headlines.py", line 37, in main
write_articles(headline, output_folder + "articles_" + term +"/")
File "./headlines.py", line 232, in write_articles
print get_blogs(headline, 5)
File "/Users/michaelnussbaum08/Documents/College/Sophmore_Year/Quarter_2/Innovation/Headlines/_code/get_content.py", line 41, in get_blogs
sock = urllib.urlopen(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 87, in urlopen
return opener.open(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 203, in open
return getattr(self, name)(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 314, in open_http
if not host: raise IOError, ('http error', 'no host given')
IOError: [Errno http error] no host given
Any ideas?
Edit: more code:
def get_blogs(term, num_results):
    search_term = term.replace(" ", "+")
    print "search_term: " + search_term
    url = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q='+search_term+'&ie=utf-8&num=10&output=rss'
    print "url: " + url
    #error occurs on line below
    sock = urllib.urlopen(url)
    html = sock.read()
    sock.close()

def write_articles(headline, output_folder, num_articles=5):
    #calls get_blogs
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_file = output_folder + headline.strip("\n") + ".txt"
    f = open(output_file, 'a')
    articles = get_articles(headline, num_articles)
    blogs = get_blogs(headline, num_articles)

#NEW FUNCTION
#the loop that calls write_articles
for term in trend_list:
    if do_find_max == True:
        fill_search_term(term, output_folder)
    headlines = headline_process(term, output_folder, max_headlines, do_find_max)
    for headline in headlines:
        try:
            write_articles(headline, output_folder + "articles_" + term + "/")
        except UnicodeEncodeError:
            pass
I had this problem when a variable I was concatenating into the url, in your case search_term in
url = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q='+search_term+'&ie=utf-8&num=10&output=rss'
had a newline character at the end. So make sure you do
search_term = search_term.strip()
You might also want to do
search_term = urllib2.quote(search_term)
to make sure your string is safe for a url
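Putting both suggestions together, a minimal sketch of get_blogs (Python 2, matching the question; urllib.quote_plus also handles the space-to-plus conversion, so the manual replace() is no longer needed):
import urllib

def get_blogs(term, num_results):
    # strip stray whitespace/newlines, then percent-encode for the query string
    search_term = urllib.quote_plus(term.strip())
    url = ('http://blogsearch.google.com/blogsearch_feeds?hl=en&q='
           + search_term + '&ie=utf-8&num=10&output=rss')
    sock = urllib.urlopen(url)
    html = sock.read()
    sock.close()
    return html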
Use urllib2 instead if you don't want to handle reading on a per-block basis yourself.
This probably does what you expect.
import urllib2
req = urllib2.Request(url='http://stackoverflow.com/')
f = urllib2.urlopen(req)
print f.read()
In your function's loop, right before the call to urlopen, perhaps put a print statement:
print(url)
sock = urllib.urlopen(url)
This way, when you run the script and get the IOError, you will see the url which is causing the problem. The error "no host given" can be replicated if url equals something like 'http://'...
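If you want to catch that case before the request instead of after, a small sketch (my own addition) that rejects URLs with no host up front:
import urllib
import urlparse  # urllib.parse on Python 3

def safe_urlopen(url):
    # urlopen raises "no host given" when the URL has an empty netloc
    if not urlparse.urlparse(url).netloc:
        raise ValueError("URL has no host: %r" % url)
    return urllib.urlopen(url)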