I want to download images. For some reason the code executes without errors, but it doesn't create any image files. I'm using Requests and BeautifulSoup, and my IDE is VS Code.
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}

def getData(link):
    URL = link
    request = requests.get(url=URL, headers=HEADERS)
    result = BeautifulSoup(request.content, "lxml")
    return result

address = "https://bisesargodha.edu.pk/content/ViewSeqImges.aspx?SubjectId=760&SeqNameAnsSubj=Sequence1"
response = getData(address)

table = response.find('table', attrs={'id': 'ContentPlaceHolder1_Table1'})
imgs = table.findAll('img')

imgNum = 1
for img in imgs:
    image_url = f"https://bisesargodha.edu.pk/content/{img['src']}"
    image_save = f"img-{imgNum}.jpg"

    pull_image = requests.get(image_url, headers=HEADERS)
    pull_image_contant = pull_image.content

    with open(image_save, "wb+") as myfile:
        myfile.write(pull_image_contant)

    imgNum = imgNum + 1
You need to fetch and consume your response as a stream. Try something like this:
img_num = 1
for img in imgs:
    image_url = f"https://bisesargodha.edu.pk/content/{img['src']}"
    image_save = f"img-{img_num}.jpg"

    with requests.get(image_url, headers=HEADERS, stream=True) as resp:
        resp.raise_for_status()  # do some additional error handling if necessary
        with open(image_save, "wb") as image_file:
            # write the response to disk in chunks
            for chunk in resp.iter_content(chunk_size=8192):
                image_file.write(chunk)

    img_num = img_num + 1
If the issue still persists, double-check the image URLs you are constructing and make sure they really point to the right content.
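One way to sanity-check them (a small sketch, assuming the src attributes are paths relative to the page) is to resolve each src against the page address with urllib.parse.urljoin instead of hard-coding the prefix, and print the results before downloading anything:

from urllib.parse import urljoin

page_url = "https://bisesargodha.edu.pk/content/ViewSeqImges.aspx?SubjectId=760&SeqNameAnsSubj=Sequence1"

for img in imgs:
    # resolve the (possibly relative) src against the page URL
    image_url = urljoin(page_url, img['src'])
    print(image_url)  # eyeball these before downloading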
I am iterating through multiple pages with the same URL except for the number at the end. Once it reaches a 404, however, the program freezes, even though I am catching the exception in a try block. Am I missing something here? Here is my code. The program hangs once it hits https://www.tenable.com/plugins/nessus/14587
import bs4 as bs
from urllib.request import urlopen, Request
import urllib

ID = 14580
while ID < 132734:
    #ID == 14391
    ID == 14580
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
    req = Request(url=reg_url, headers=headers)
    try:
        source = urlopen(req).read()
    except urllib.error.HTTPError as e:
        if e.getcode() == 404:  # check the return code
            continue
        raise
    soup = bs.BeautifulSoup(source, 'lxml')
    print(ID)
    print(reg_url)
    print(soup.title.string)
    ID += 1
UPDATED WORKING CODE:
import bs4 as bs
from urllib.request import urlopen, Request
import urllib

ID = 14580
while ID < 132734:
    #ID == 14391
    ID == 14580
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
    req = Request(url=reg_url, headers=headers)
    try:
        source = urlopen(req).read()
    except urllib.error.HTTPError as e:
        if e.getcode() == 404:  # check the return code
            ID += 1
            continue
        raise
    soup = bs.BeautifulSoup(source, 'lxml')
    print(ID)
    print(reg_url)
    print(soup.title.string)
    ID += 1
I added another increment to ID inside the exception block, as seen in the updated code, and it works fine now. The program wasn't actually frozen: continue skipped the ID += 1 at the bottom of the loop, so it kept re-requesting the same 404 URL forever.
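Not part of the original post, just a sketch of the same scraping loop: driving the IDs with a for loop avoids this class of bug entirely, because the increment can never be skipped by a continue.

import bs4 as bs
import urllib
from urllib.request import urlopen, Request

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

for ID in range(14580, 132734):
    reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
    try:
        source = urlopen(Request(url=reg_url, headers=headers)).read()
    except urllib.error.HTTPError as e:
        if e.getcode() == 404:
            continue  # skipping an ID can no longer loop forever
        raise
    soup = bs.BeautifulSoup(source, 'lxml')
    print(ID, reg_url, soup.title.string)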
I'm trying to detect the availability of an item on Amazon. Why doesn't this code work?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers = headers)
    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[#id ="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
        return AVAILABILITY

file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    html = req.get(i)
    doc = SimplifiedDoc(html)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    #soup = BeautifulSoup(html, "lxml")
    soup = BeautifulSoup(response.content, features="lxml")
    title = soup.select("#productTitle")[0].get_text().strip()
    if check(i) == 'In stock.':
        price = soup.select("#priceblock_saleprice")[0].get_text()
    else:
        price = "UNAVAILABLE"
    review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])
    jsonObject = {'title': title, 'price': price, 'review_count': review_count}
    print json.dumps(jsonObject, indent=2)
    print "////////////////////////////////////////////////"
    print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
When I execute it, this error appears:
File "scra.py", line 17, in check
doc = html.fromstring(page.content)
AttributeError: 'unicode' object has no attribute 'fromstring'
Please help me. I already tried converting page with pagedata = page.json(), but that only made things worse.
Try using this instead of html.fromstring:
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
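For context, a minimal sketch of the check function rewritten along those lines (assuming the availability text lives in a div with id="availability", as in the original XPath) could look like:

from bs4 import BeautifulSoup
import requests

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    doc = BeautifulSoup(page.content, 'html.parser')
    availability = doc.find('div', id='availability')
    # return the stripped text if the div exists, otherwise None
    return availability.get_text().strip() if availability else None

The underlying error, for what it's worth, comes from the name html being reassigned later in the script (html = req.get(i)), which shadows the lxml.html module; parsing with BeautifulSoup here sidesteps that, though renaming that variable would also work.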
I feel puzzled.
My idea is to send a request to the URL, extract the hidden POST data from the page, and then post it back to the page. When I used urllib.request in Python, it failed, but when I used requests instead, it worked!
Please tell me why.
Here is the code; the commented-out lines are the urllib.request version.
import urllib.request
import http.cookiejar
import re
import requests
loginUrl='https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'
#Here is the urllib.request code
#cookies=http.cookiejar.MozillaCookieJar()
#handler=urllib.request.HTTPCookieProcessor(cookies)
#opener=urllib.request.build_opener(handler)
headers = {
    'Origin': 'http://passport.csdn.net',
    'Referer': 'http://passport.csdn.net/account/login?from=http%3A%2F%2Fmy.csdn.net%2Fmy%2Fmycsdn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER'
}
#Here is the requests code
s = requests.Session()
data = s.get(loginUrl)
data = data.text
#request = urllib.request.Request(loginUrl)
#response = urllib.request.urlopen(request)
#data = response.read().decode('utf-8')
#I get the value of lt and execution from the web page
pattern_lt = re.compile('<input type="hidden" name="lt" value="(.*?)" />',re.S)
lt = re.findall(pattern_lt,data)
lt = lt[0]
pattern_exe = re.compile('<input type="hidden" name="execution" value="(.*?)" />',re.S)
exe = re.findall(pattern_exe,data)
exe = exe[0]
postDict = {
    'username': 'qinyufeng_hdq#163.com',
    'password': 'csdn690076598',
    'lt': lt,
    'execution': exe,
    '_eventId': 'submit'
}
r = s.post(loginUrl, data=postDict)
#postData = urllib.parse.urlencode(postDict).encode()
#request = urllib.request.Request(loginUrl, postData,headers)
#response = opener.open(request)
#data = response.read().decode('UTF-8')
print (r.text)
I'm not good at English; I hope you get my idea, and thank you for reading my problem.
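For comparison, the requests version works through a single Session, which keeps the cookies from the initial GET and sends them back with the POST. A rough urllib.request equivalent would route both requests through the same cookie-handling opener; this is only a sketch of that idea, reusing loginUrl, headers and postDict from above, not a verified fix for the CSDN login:

import urllib.request
import urllib.parse
import http.cookiejar

cookies = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookies))

# GET through the opener so the session cookies land in the jar
data = opener.open(urllib.request.Request(loginUrl, headers=headers)).read().decode('utf-8')

# ... extract lt / execution from data as above ...

postData = urllib.parse.urlencode(postDict).encode()
# POST through the same opener so those cookies are sent back
response = opener.open(urllib.request.Request(loginUrl, postData, headers))
print(response.read().decode('utf-8'))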
I am playing with Python and trying to parse a web page in order to automate my football-watching process on an Amazon Fire TV.
I produced the following code to read HTML pages by URL:
from httplib import BadStatusLine
import urllib2
import logging


class HtmlWorkerLiveFootball:
    URL = 'http://livefootball.ws/'

    def get_list_of_matches(self, url):
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'
                              )]
        urllib2.install_opener(opener)
        try:
            logging.warning("request = %s" % opener.addheaders)
            page = urllib2.urlopen(url)
            logging.warning("result = %s" % page.read())
        except urllib2.HTTPError, error:
            logging.error("error code = %d" % error.code)
        except BadStatusLine:
            logging.error("could not fetch %s" % url)


class HeadRequest(urllib2.Request):
    def get_method(self):
        return "HEAD"


class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code in (301, 302, 303, 307):
            logging.warning("redirect_request = %d" % code)
            newurl = newurl.replace(' ', '%20')
            logging.warning("new url = %s" % newurl)
            logging.warning("headers = %s" % headers)
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type"))
            logging.debug("newheaders = %s" % newheaders)
            request = HeadRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(),
                                  unverifiable=True)
            request.add_header('User-agent',
                               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')
            request.add_header('Cookie', headers.dict['set-cookie'])
            request.add_header('Host', "livefootball.ws")
            request.add_header('Accept-Encoding', "gzip,deflate,sdch")
            request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
            request.add_header('Cache-Control', "max-age=0")
            request.add_header('Accept-Language', "en-US,en;q=0.8,ru;q=0.6")
            logging.warning("request = %s" % request.headers)
            return request
        else:
            raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)


class HTTPMethodFallback(urllib2.BaseHandler):
    def http_error_405(self, req, fp, code, msg, headers):
        logging.warning("http_error_405. Headers = %s" % headers)
        fp.read()
        fp.close()
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in ("content-length", "content-type"))
        return self.parent.open(urllib2.Request(req.get_full_url(),
                                                headers=newheaders,
                                                origin_req_host=req.get_origin_req_host(),
                                                unverifiable=True))


# run the scraper
htmlWorker = HtmlWorkerLiveFootball()
htmlWorker.get_list_of_matches(htmlWorker.URL)
It works for most sites on the internet, but unfortunately the site I need seems to guard against DDoS attacks with a mechanism I'm not familiar with (a redirect plus something involving cookies). I am trying to emulate browser behavior, but I end up with an empty string.
Here is the log I get after executing this code:
WARNING:root:request = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')]
WARNING:root:redirect_request = 307
WARNING:root:new url = http://livefootball.ws/?dos=1
WARNING:root:headers = Server: nginx
Date: Sun, 15 Jun 2014 14:11:03 GMT
Content-Type: text/html
Content-Length: 180
Connection: close
Set-Cookie: antid=6abeccafd9ac44951b4acc7f642649b7; path=/
Location: http://livefootball.ws/?dos=1
WARNING:root:request = {'Accept-language': 'en-US,en;q=0.8,ru;q=0.6', 'Accept-encoding': 'gzip,deflate,sdch', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36', 'Host': 'livefootball.ws', 'Cookie': 'antid=6abeccafd9ac44951b4acc7f642649b7; path=/', 'Cache-control': 'max-age=0'}
WARNING:root:result =
How can I read this page with Python? Thanks.
If you want to read HTML pages by URL, you can use the requests library instead of urllib2. It is very easy to use:
import requests
session = requests.Session()
index_url = 'http://livefootball.ws/'
index_request = session.get(index_url)
#change encoding of the response
index_request.encoding = 'CP1251'
#print page content
print index_request.text
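If the site still bounces you to the ?dos=1 page, it may also help to send a browser-like User-Agent on the session and then parse the result. A small sketch of that idea (the header value is borrowed from the question, and BeautifulSoup is just one parsing option):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
# present a browser-like User-Agent on every request made through this session
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'})

index_request = session.get('http://livefootball.ws/')
index_request.encoding = 'CP1251'

# the session keeps any anti-DDoS cookie, so the parsed result should be the real page
soup = BeautifulSoup(index_request.text, 'html.parser')
print soup.title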