Scraper requests image corrupted - python

First time trying make something in python. Decided that it was a img-scraper.
it's found and download all images, but they are all corrupted. Found info about wrong unicode in BeatySoup, but I did not understand what was wrong. img in jpg, gif and png.
I don't use urllib because site blocking it (403 forbidden)
from bs4 import BeautifulSoup
import requests
import time
url = 'some url'
r = requests.get(url)
html = r.text
soup = BeautifulSoup(html, 'lxml')
images = []
for img in soup.findAll('img', {'class': '_images'}):
images.append(img.get('data-url'));
for i in range(len(images)):
s = images[i]
cutname = s.split("/")[-1]
filename = cutname[:cutname.find("?")]
f = open(filename,'wb')
f.write((requests.get(s)).content)
f.close()
time.sleep(0.5)

Seems like you need to pass some headers. The bottom part of the code to write the image file out is by #Deepspace
from bs4 import BeautifulSoup
import requests
url = "https://www.webtoons.com/en/comedy/bluechair/ep-366-husk/viewer?title_no=199&episode_no=538"
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
'Referer' : url
}
r = requests.get(url, headers = headers)
soup=BeautifulSoup(r.content,'lxml')
imgs=[link['data-url'] for link in soup.select('#_imageList img')]
counter = 0
for img in imgs:
counter = counter + 1
filename = 'image' + str(counter) + '.jpg'
with open(filename, 'wb') as handle:
response = requests.get(img, stream=True, headers = headers)
if not response.ok:
print(response)
for block in response.iter_content(1024):
if not block:
break
handle.write(block)

Related

Requests Not Creating Images Python

I want to download images. But for some reason the code execute without errors but it's not creating any images.I'm using Requests and BeautifulSoup. My IDE is VS Code
HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
def getData(link):
URL = link
request = requests.get(url=URL, headers=HEADERS)
result = BeautifulSoup(request.content, "lxml")
return result
address = "https://bisesargodha.edu.pk/content/ViewSeqImges.aspx?SubjectId=760&SeqNameAnsSubj=Sequence1"
response = getData(address)
table = response.find('table', attrs = {'id':'ContentPlaceHolder1_Table1'})
imgs = table.findAll('img')
imgNum = 1
for img in imgs:
image_url = f"https://bisesargodha.edu.pk/content/{img['src']}"
image_save = f"img-{imgNum}.jpg"
pull_image = requests.get(image_url, headers=HEADERS)
pull_image_contant = pull_image.content
with open(image_save, "wb+") as myfile:
myfile.write(pull_image_contant)
imgNum = imgNum + 1
You need to fetch and consume your response as a stream, try something like this:
img_num = 1
for img in imgs:
image_url = f"https://bisesargodha.edu.pk/content/{img['src']}"
image_save = f"img-{img_num}.jpg"
with requests.get(image_url, headers=HEADERS, stream=True) as resp:
resp.raise_for_status() # do some additional error handling if necessary
with open(image_save, "wb") as image_file:
for chunk in r.iter_content(chunk_size=8192):
image_file.write(chunk)
img_num = img_num + 1
If the issue still persists then maybe double check the image urls you are constructing and make sure they are really pointing to the right content.

how to extract a link inside <script> tag

I'm trying to get a .mp3 file in a link from TikTok sounds
the problem is I can't extract it because it's inside <"script"> tag
I'm using pycurl instead of requests
all i need is to extract this from the response then extract the URL from UrlList"
"playUrl":{"Uri":"musically-maliva-obj/7038595527327419141.mp3","UrlList":["https://sf16-ies-music-va.tiktokcdn.com/obj/musically-maliva-obj/7038595527327419141.mp3"]}
import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()
c = pycurl.Curl()
c.setopt(pycurl.CAINFO, certifi.where())
c.setopt(c.URL, url)
c.setopt(pycurl.SSL_VERIFYPEER, 0)
c.setopt(pycurl.SSL_VERIFYHOST, 0)
c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
c.setopt(c.WRITEDATA, buffer)
c.setopt(c.FOLLOWLOCATION, True)
c.perform()
c.close()
body = buffer.getvalue()
response = body.decode('utf-8')
#response = response.split('"')
#response = response[1]
#response = response.split('.html?')
#response= response[0]
a = response.split("'") # gives me a list and i don't know how to search in it
soup = BeautifulSoup(response, 'html.parser') # cause response is a string
link = soup.find("script", id="sigi-persisted-data") #i tried to use bs4 but i couldn't find a reasult
print(link)
You can try extracting the json data, parse it to dictionary value and then navigate dictionary to get the data (json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])
import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup
import re
import json
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()
c = pycurl.Curl()
c.setopt(pycurl.CAINFO, certifi.where())
c.setopt(c.URL, url)
c.setopt(pycurl.SSL_VERIFYPEER, 0)
c.setopt(pycurl.SSL_VERIFYHOST, 0)
c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
c.setopt(c.WRITEDATA, buffer)
c.setopt(c.FOLLOWLOCATION, True)
c.perform()
c.close()
body = buffer.getvalue()
response = body.decode('utf-8')
soup = BeautifulSoup(response, 'html.parser')
scripts = soup.findAll("script")
for s in scripts:
s_str = str(s)
res = re.search(r'<script>window.__INIT_PROPS__ = (.*)</script>', s_str)
if res:
json_data = json.loads(res.group(1))
print(json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])
You can use a regular expressions pattern:
import re
...
print(re.search(r'"playUrl":"(.*)"', str(soup)).group(1))

How to get the value of a "hidden" href?

I'm working with web scraping to, at first, collect the total pages. I have tested the code I made for another site and however I am having a problem getting the next page link (href).
Here's the code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975' # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
pages = []
with requests.Session() as session:
page_number = 1
url = "https://www.last.fm/user/"+userName+"/library/artists?page="
while True:
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
pages.append(url)
next_link = soup.find("li", class_="pagination-next")
if next_link is None:
break
url = urljoin(url, next_link["href"])
page_number += 1
As you can see, the href of this site presents the link as "?page=2", which does not allow me to get its content (https://www.last.fm/user/brendanm1975/library/artists?page=2).
I've already inspected the variables, and I'm getting the values.
print(url) # output: https://www.last.fm/user/brendanm1975/library/artists?page=
next_link.find('a').get('href') # output: '?page=2'
Does anyone know how to get around this?
What happens?
You try to urljoin(url, next_link["href"]) but next_link do not have an attribute href cause you are selecting the <li> not the <a>.
How to fix?
Option#1 - Just select the <a> in your urljoin():
url = urljoin(url, next_link.a["href"])
Option#2 - Select the <a> directly:
next_link = soup.select_one('li.pagination-next a')
Example
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975' # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
pages = []
with requests.Session() as session:
url = "https://www.last.fm/user/"+userName+"/library/artists?page=1"
while True:
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
pages.append(url)
next_link = soup.find("li", class_="pagination-next")
if next_link is None:
break
url = urljoin(url, next_link.a["href"])
Output
['https://www.last.fm/user/brendanm1975/library/artists?page=1',
'https://www.last.fm/user/brendanm1975/library/artists?page=2',
'https://www.last.fm/user/brendanm1975/library/artists?page=3',
'https://www.last.fm/user/brendanm1975/library/artists?page=4',
'https://www.last.fm/user/brendanm1975/library/artists?page=5',
'https://www.last.fm/user/brendanm1975/library/artists?page=6',
'https://www.last.fm/user/brendanm1975/library/artists?page=7',
'https://www.last.fm/user/brendanm1975/library/artists?page=8',
'https://www.last.fm/user/brendanm1975/library/artists?page=9',
'https://www.last.fm/user/brendanm1975/library/artists?page=10',
'https://www.last.fm/user/brendanm1975/library/artists?page=11',
'https://www.last.fm/user/brendanm1975/library/artists?page=12',
'https://www.last.fm/user/brendanm1975/library/artists?page=13',
'https://www.last.fm/user/brendanm1975/library/artists?page=14',
'https://www.last.fm/user/brendanm1975/library/artists?page=15',
'https://www.last.fm/user/brendanm1975/library/artists?page=16',
'https://www.last.fm/user/brendanm1975/library/artists?page=17',
'https://www.last.fm/user/brendanm1975/library/artists?page=18',...]

Corrupted pdf when using requests (python)

I am trying to download all pdf files from one website but every pdf created is corrupted...
import requests
from bs4 import BeautifulSoup
url ="https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0
for link in links:
if('.pdf' in link.get('href', [])):
i += 1
print("Downloading file: ", i)
response = requests.get(link.get('href'))
pdf = open("pdf"+str(i)+".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")
print("All PDF files downloaded")
Add headers to your requests
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_8_7 rv:5.0; en-US) AppleWebKit/533.31.5 (KHTML, like Gecko) Version/4.0 Safari/533.31.5',
}
from bs4 import BeautifulSoup
url ="https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0
for link in links:
if('.pdf' in link.get('href', [])):
i += 1
print("Downloading file: ", i)
response = requests.get(link.get('href'), headers=headers)
pdf = open("pdf"+str(i)+".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")
print("All PDF files downloaded")

Why can I only scrape 16 photos from pixabay?

I need to get Backlight Image Data so I'm trying to get backlight images from pixabay. But only 16 images are downloaded by the following code.
I tried to find why, and I found the difference in the html source.
The images that I downloaded are in the tag "img srcset", and my source downloads the first picture in the srcset.
But the other pictures are in "img src", and my source can't download it.
Does anyone know what is the problem??
Code
from bs4 import BeautifulSoup
import urllib.request
import os.path
url="https://pixabay.com/images/search/backlight/"
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
source = response.read()
soup = BeautifulSoup(source, "html.parser")
img = soup.find_all("img")
cnt = 0
for image in img:
img_src=image.get("src")
if img_src[0]=='/':
continue
cnt += 1
print(img_src)
path = "C:/Users/Guest001/Test/" + str(cnt) + ".jpg"
print(path)
urllib.request.urlretrieve(img_src, path)
Some of the images have in src a /static/img/blank.gif and the real url is in the data-lazy attribute. Also some of the images have .png suffix. Here is a working example.
from bs4 import BeautifulSoup
import urllib.request
import os.path
url="https://pixabay.com/images/search/backlight/"
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
source = response.read()
soup = BeautifulSoup(source, "html.parser")
img = soup.find_all("img")
cnt = 0
for image in img:
img_src= image.get("src") if '.gif' not in image.get("src") else image.get('data-lazy')
if img_src[0]=='/':
continue
cnt += 1
print(img_src)
path = ''
if '.jpg' in img_src:
path = "C:/Users/Guest001/Test/" + str(cnt) + ".jpg"
elif '.png' in img_src:
path = "C:/Users/Guest001/Test/" + str(cnt) + ".png"
print(path)
urllib.request.urlretrieve(img_src, path)

Categories

Resources