I want to download images, but for some reason the code executes without errors yet no images are created. I'm using Requests and BeautifulSoup, and my IDE is VS Code.
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}

def getData(link):
    URL = link
    request = requests.get(url=URL, headers=HEADERS)
    result = BeautifulSoup(request.content, "lxml")
    return result

address = "https://bisesargodha.edu.pk/content/ViewSeqImges.aspx?SubjectId=760&SeqNameAnsSubj=Sequence1"
response = getData(address)

table = response.find('table', attrs={'id': 'ContentPlaceHolder1_Table1'})
imgs = table.findAll('img')

imgNum = 1
for img in imgs:
    image_url = f"https://bisesargodha.edu.pk/content/{img['src']}"
    image_save = f"img-{imgNum}.jpg"

    pull_image = requests.get(image_url, headers=HEADERS)
    pull_image_contant = pull_image.content

    with open(image_save, "wb+") as myfile:
        myfile.write(pull_image_contant)

    imgNum = imgNum + 1
You need to fetch and consume the response as a stream; try something like this:
img_num = 1
for img in imgs:
    image_url = f"https://bisesargodha.edu.pk/content/{img['src']}"
    image_save = f"img-{img_num}.jpg"

    with requests.get(image_url, headers=HEADERS, stream=True) as resp:
        resp.raise_for_status()  # do some additional error handling if necessary
        with open(image_save, "wb") as image_file:
            for chunk in resp.iter_content(chunk_size=8192):
                image_file.write(chunk)

    img_num = img_num + 1
If the issue still persists, double-check the image URLs you are constructing and make sure they really point to the right content.
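One quick way to sanity-check those URLs is to resolve each src against the page you fetched and print the result before downloading anything. A minimal sketch, assuming the same page address as in the question:

from urllib.parse import urljoin

page_url = "https://bisesargodha.edu.pk/content/ViewSeqImges.aspx?SubjectId=760&SeqNameAnsSubj=Sequence1"

for img in imgs:
    # urljoin resolves relative src values against the page URL,
    # so you can see exactly which address is being requested
    image_url = urljoin(page_url, img['src'])
    print(image_url)  # open a few of these in a browser to confirm they return images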
This code is from a separate submission. If you look at these lines:
each['AuthorString']
each['Title']
I'm wondering where the user got these variables from. I navigated to the JSON page
Link
and could not find these variables. Maybe I'm on the wrong page? Screenshots would help.
Here is the code:
import requests

session_ids = ['13619', '13736']

for session_id in session_ids:
    url = 'https://cdn-solr.asco.org/solr/ml/mlselect'
    payload = '?_format=json&wt=json&indent=true&q=SessionId:' + session_id + '&start=0&rows=30&sort=score%20desc,%20SessionId%20asc&fq=RecordType:sessions&facet=true&f.Year.facet.sort=index&facet.field={!key=Year}Year&facet.field={!key=subject_thes}subject_thes&facet.field={!key=MediaTypes}MediaTypes&facet.field={!key=fctSessionType}fctSessionType&facet.pivot={!key=MeetingName}fctMeetingName,fctTrack&spellcheck.maxCollationTries=100'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    jsonData = requests.get(url + payload, headers=headers).json()

    sessionParticipationID = jsonData['response']['docs'][0]['SessionParticipationID']
    session_id_list = '%20OR%20'.join(sessionParticipationID)

    payload = '?_format=json&wt=json&indent=true&sort=PresentationOrderWithinSession%20asc,%20ISODateString%20asc,%20ISODateStringEnd%20asc&fl=_id,%20score,%20ISODateString,%20ISODateStringEnd,%20ISODateString_1,%20ISODateStringEnd_1,%20Year,%20Title,%20tempAbstractID,%20MediaID,%20VideoID,%20EdBookID,%20edBookTitle,%20PosterID,%20edBookTitle,%20SessionTitle,%20SessionTypeId,%20AuthorString,%20AbstID,%20Role,%20FullName,%20PosterBoard,%20Institution,%20ProgramTitle,%20MeetingName,%20FirstAuthor&q=_id:(' + session_id_list + ')&rows=' + str(len(sessionParticipationID))
    jsonData = requests.get(url + payload, headers=headers).json()

    title_auth = []  # <-- to make a list of {title: author} dictionaries
    for each in jsonData['response']['docs']:
        title = each['Title']           # this line
        author = each['AuthorString']   # and this
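In case it is useful: Title and AuthorString are simply fields of each document in the Solr JSON response; the second payload even requests them explicitly in its fl= parameter. A minimal sketch to see which fields actually come back, assuming the same jsonData as above:

# Print the field names present on the first returned document;
# 'Title' and 'AuthorString' should be among them if the fl= list was honoured.
docs = jsonData['response']['docs']
if docs:
    print(sorted(docs[0].keys()))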
I'm trying to detect the availability of an item on Amazon. Why doesn't this code work?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)

    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
        return AVAILABILITY

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    html = req.get(i)
    doc = SimplifiedDoc(html)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i

    response = requests.get(i, headers=headers)
    #soup = BeautifulSoup(html, "lxml")
    soup = BeautifulSoup(response.content, features="lxml")
    title = soup.select("#productTitle")[0].get_text().strip()

    if check(i) == 'In stock.':
        price = soup.select("#priceblock_saleprice")[0].get_text()
    else:
        price = "UNAVAILABLE"

    review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])

    jsonObject = {'title': title, 'price': price, 'review_count': review_count}
    print json.dumps(jsonObject, indent=2)
    print "////////////////////////////////////////////////"
    print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
When I execute it, this error appears:
  File "scra.py", line 17, in check
    doc = html.fromstring(page.content)
AttributeError: 'unicode' object has no attribute 'fromstring'
Please help me. I already tried converting the response with pagedata = page.json(), but it only made things worse.
Try using this instead of html.fromstring:
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
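For completeness, the original call appears to fail because the later loop does html = req.get(i), which rebinds the name html from the lxml module to a string, so renaming that variable would also cure the AttributeError. With BeautifulSoup, a minimal sketch for pulling the availability text out of the page (assuming Amazon still renders a div with id="availability", as the original XPath implies) could look like this:

doc = BeautifulSoup(page.content, 'html.parser')
# Look up the availability container by the same id used in the original XPath
availability_div = doc.find('div', id='availability')
availability = availability_div.get_text(strip=True) if availability_div else None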
I need backlight image data, so I'm trying to download backlight images from Pixabay, but only 16 images are downloaded by the following code.
I tried to find out why and noticed a difference in the HTML source: the images I did download sit in an img tag with a srcset attribute, and my code downloads the first picture in that srcset. The other pictures only have an img src, and my code can't download them.
Does anyone know what the problem is?
Code
from bs4 import BeautifulSoup
import urllib.request
import os.path

url = "https://pixabay.com/images/search/backlight/"

opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)

req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
source = response.read()

soup = BeautifulSoup(source, "html.parser")
img = soup.find_all("img")

cnt = 0
for image in img:
    img_src = image.get("src")
    if img_src[0] == '/':
        continue
    cnt += 1
    print(img_src)
    path = "C:/Users/Guest001/Test/" + str(cnt) + ".jpg"
    print(path)
    urllib.request.urlretrieve(img_src, path)
Some of the images have /static/img/blank.gif as their src, and the real URL is in the data-lazy attribute. Also, some of the images have a .png suffix. Here is a working example.
from bs4 import BeautifulSoup
import urllib.request
import os.path

url = "https://pixabay.com/images/search/backlight/"

opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)

req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
source = response.read()

soup = BeautifulSoup(source, "html.parser")
img = soup.find_all("img")

cnt = 0
for image in img:
    img_src = image.get("src") if '.gif' not in image.get("src") else image.get('data-lazy')
    if img_src[0] == '/':
        continue
    cnt += 1
    print(img_src)
    path = ''
    if '.jpg' in img_src:
        path = "C:/Users/Guest001/Test/" + str(cnt) + ".jpg"
    elif '.png' in img_src:
        path = "C:/Users/Guest001/Test/" + str(cnt) + ".png"
    print(path)
    urllib.request.urlretrieve(img_src, path)
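As a small follow-up on the design choice: instead of checking for .jpg and .png explicitly, the extension can be derived from the URL itself. A minimal sketch, assuming the URL path ends in an ordinary file name:

from urllib.parse import urlparse
import os.path

# Take the extension from the URL path instead of hard-coding .jpg / .png,
# falling back to .jpg if the path has no extension at all.
ext = os.path.splitext(urlparse(img_src).path)[1] or ".jpg"
path = "C:/Users/Guest001/Test/" + str(cnt) + ext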
This is my first time trying to make something in Python; I decided it would be an image scraper.
It finds and downloads all the images, but they are all corrupted. I found information about wrong Unicode handling in BeautifulSoup, but I did not understand what was wrong. The images are in jpg, gif and png.
I don't use urllib because the site blocks it (403 Forbidden).
from bs4 import BeautifulSoup
import requests
import time

url = 'some url'
r = requests.get(url)
html = r.text
soup = BeautifulSoup(html, 'lxml')

images = []
for img in soup.findAll('img', {'class': '_images'}):
    images.append(img.get('data-url'))

for i in range(len(images)):
    s = images[i]
    cutname = s.split("/")[-1]
    filename = cutname[:cutname.find("?")]
    f = open(filename, 'wb')
    f.write((requests.get(s)).content)
    f.close()
    time.sleep(0.5)
It seems like you need to pass some headers. The bottom part of the code, which writes the image file out, is by @Deepspace.
from bs4 import BeautifulSoup
import requests

url = "https://www.webtoons.com/en/comedy/bluechair/ep-366-husk/viewer?title_no=199&episode_no=538"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'Referer': url
}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
imgs = [link['data-url'] for link in soup.select('#_imageList img')]

counter = 0
for img in imgs:
    counter = counter + 1
    filename = 'image' + str(counter) + '.jpg'
    with open(filename, 'wb') as handle:
        response = requests.get(img, stream=True, headers=headers)
        if not response.ok:
            print(response)
        for block in response.iter_content(1024):
            if not block:
                break
            handle.write(block)
I'm getting an 'HTTP Error 405: Method Not Allowed' error. My code is:
import urllib.request
import urllib.parse

try:
    url = 'https://www.google.com/search'
    values = {'q': 'python programming tutorials'}

    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')  # data should be bytes
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"

    req = urllib.request.Request(url, data, headers=headers)
    resp = urllib.request.urlopen(req)
    print("HERE")
    respData = resp.read()

    saveFile = open('withHeaders.txt', 'w')
    saveFile.write(str(respData))
    saveFile.close()
except Exception as e:
    print(e)
I guess the error is in req = urllib.request.Request(url, data, headers=headers). What is the error, is it syntactical? What should be changed in the code? And if I have made a conceptual mistake, please correct me.
EDIT
Concept:
def URLRequest(url, params, method="GET"):
    if method == "POST":
        return urllib2.Request(url, data=urllib.urlencode(params))
    else:
        return urllib2.Request(url + "?" + urllib.urlencode(params))
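For reference, here is a minimal Python 3 sketch of the same idea, since the snippet above uses the Python 2 urllib2/urllib names: GET parameters go into the URL, while POST parameters go into the request body. Passing data to Request is what turns the request into a POST, which is presumably why Google's /search endpoint answers with 405.

import urllib.parse
import urllib.request

def url_request(url, params, method="GET"):
    encoded = urllib.parse.urlencode(params)
    if method == "POST":
        # POST: the encoded parameters are sent as the request body (bytes)
        return urllib.request.Request(url, data=encoded.encode('utf-8'))
    # GET: the encoded parameters are appended to the URL and data stays None
    return urllib.request.Request(url + "?" + encoded)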
You can use the Requests library instead; it's much cleaner than urllib.
import requests

q = 'Whatever you want to search'
url = 'https://www.google.com/search'

response = requests.get(url + '?' + 'q=' + q)

saveFile = open('response.txt', 'w')
saveFile.write(response.text)
saveFile.close()
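A small follow-up on the same idea: rather than concatenating the query string by hand, requests can encode it for you through the params argument, which also takes care of spaces and special characters:

import requests

q = 'Whatever you want to search'
# requests builds and percent-encodes the ?q=... query string itself
response = requests.get('https://www.google.com/search', params={'q': q})

with open('response.txt', 'w') as save_file:
    save_file.write(response.text)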
Or, if you want to stick with urllib, you can do this:
import urllib.request

url = 'https://www.google.com/search'
q = 'Search Query'
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

request = urllib.request.Request(url + '?' + 'q=' + q, headers=headers)
response = urllib.request.urlopen(request).read()  # the text of the response is here

saveFile = open('withHeaders.txt', 'w')
saveFile.write(str(response))
saveFile.close()
Here is an example, in reference to www.pythonforbeginners:
# Import the module
import urllib.request

# Your search text
text = "hi google"

# Define the URL
url = 'http://www.google.com/#q=' + text

# Add your headers
headers = {'User-Agent': 'Mozilla 5.10'}

# Create the Request
request = urllib.request.Request(url, None, headers)

# Get the response
response = urllib.request.urlopen(request)

# Print the response body
print(response.read())