How to download an image with a very long URL? - python

I am trying to download a picture from "https://prnt.sc", but the URL of the image is extremely long, and I also can't find that URL when printing r.content.
This is my code for getting the HTML:
import requests
import random
import string

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 OPR/63.0.3368.71'
}

register_data = {"path": "luk111"}
print(register_data)

with requests.Session() as s:
    url = 'https://prnt.sc/luk111'
    r = s.post(url, json=register_data, headers=headers)
    print(r)
    print(r.content)
The whole URL has around 81,954 characters, so I need a better way to download it. Any ideas?
This is my code for downloading the .jpg image:
import random
import urllib.request

def download_web_image(url):
    name = "xddd"
    full_name = "screen/" + str(name) + ".jpg"
    urllib.request.urlretrieve(url, full_name)

xd = input("paste url")
download_web_image(xd)

The long URL on the page is not a real URL but the image's data encoded in base64.
So first I turned off JavaScript in the web browser and checked the page without JavaScript, because requests and BeautifulSoup can't run JavaScript.
Without JavaScript I see a normal URL to the image, and I have no problem downloading it.
import requests
from bs4 import BeautifulSoup as BS

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 OPR/63.0.3368.71'
}

with requests.Session() as s:
    url = 'https://prnt.sc/luk111'
    r = s.get(url, headers=headers)

    soup = BS(r.content, 'html.parser')
    img_url = soup.find('img', {'id': 'screenshot-image'})['src']

    r = s.get(img_url, headers=headers)
    with open('temp.png', 'wb') as f:
        f.write(r.content)
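For completeness: if you did want to handle the JavaScript version of the page, the long src is a base64 data URI, which you can decode directly with the base64 module. A minimal sketch, assuming src holds the data URI taken from the img tag (the payload below is a placeholder, not real image data):
import base64

# hypothetical data URI; in practice this would be the img tag's src attribute
src = "data:image/png;base64,AAAA"  # placeholder payload for illustration

header, encoded = src.split(",", 1)  # strip the "data:image/png;base64" prefix
with open("temp_from_data_uri.png", "wb") as f:
    f.write(base64.b64decode(encoded))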

How to get text paragraphs from a website: Error 403 Forbidden

I am trying to do web scraping with the help of requests and BeautifulSoup, but the desired outcome is null.
My code is as follows:
def urlscrape(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html')
    text = [p.text for p in soup.find(class_='bg-white').find_all('p')]
    print(url)
    return text
The website is: https://www.afghanistan-analysts.org/en/reports/war-and-peace/taleban-prisoners-release-are-the-latest-proposals-legal/
I want all the <p> tags containing paragraphs to be extracted as texts.
You can try this:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url = 'https://www.afghanistan-analysts.org/en/reports/war-and-peace/taleban-prisoners-release-are-the-latest-proposals-legal/'
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html')
text = [p.text for p in soup.find_all('p')]
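To inspect the result, you could join the collected paragraphs (a small usage sketch continuing from the snippet above):
# assumes `text` from the snippet above
print(f"{len(text)} paragraphs scraped")
print("\n\n".join(text))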
Try this ...
url="https://www.afghanistan-analysts.org/en/reports/war-and-peace/taleban-prisoners-release-are-the-latest-proposals-legal/"
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
headers = {'User-Agent': user_agent}
response = requests.get(url,headers=headers)
html = response.content
print(response.content)
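This only fetches the raw HTML. To pull out the paragraphs the question asks for, you could continue with BeautifulSoup; a sketch building on the response above (the bg-white selector comes from the original question and may not match the current page):
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
container = soup.find(class_='bg-white')  # selector taken from the question
# fall back to the whole page if that class is not found
paragraphs = [p.get_text() for p in (container or soup).find_all('p')]
print(paragraphs)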

It returns None when I get an element by id from the URL using Beautiful Soup; how can I get the content of that id?

import requests
import json
from bs4 import BeautifulSoup

URL = 'https://www.amazon.com/Ozeri-Digital-Multifunction-Kitchen-Elegant/dp/B01LAVADW2?pf_rd_p=3e7c8265-9bb7-5ab2-be71-1af95f06a1ad&pf_rd_r=52Z7DNQGKGV31B114R1K&pd_rd_wg=IAKey&ref_=pd_gw_ri&pd_rd_w=rDONb&pd_rd_r=b6b3cf66-c4a8-449a-8676-9027e8922b96'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}

page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
You created a headers variable, but you didn't add it to your request; you are also not checking your request's response status code (which is 503 here).
Fixing your code, it should look something like this:
import requests
import json
from bs4 import BeautifulSoup

URL = 'https://www.amazon.com/Ozeri-Digital-Multifunction-Kitchen-Elegant/dp/B01LAVADW2?pf_rd_p=3e7c8265-9bb7-5ab2-be71-1af95f06a1ad&pf_rd_r=52Z7DNQGKGV31B114R1K&pd_rd_wg=IAKey&ref_=pd_gw_ri&pd_rd_w=rDONb&pd_rd_r=b6b3cf66-c4a8-449a-8676-9027e8922b96'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}

r = requests.get(URL, headers=headers)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')  # explicit parser avoids a bs4 warning
    title = soup.find(id="productTitle")
    print(title.next)
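print(title.next) relies on the text node directly after the tag; a slightly more robust alternative, assuming the lookup succeeded, is get_text:
# continuing from the block above
title = soup.find(id="productTitle")
if title:  # guard against a None result (e.g. a bot-check page)
    print(title.get_text(strip=True))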

BeautifulSoup does not show some tags in an HTML page

If I visit this page, I can see the image on it in an img tag upon inspection.
But when I try to get the page using requests and parse it with BeautifulSoup, I can't access the same image. What am I missing here?
The code runs fine; I get 200 as the status_code from the request.
import requests
from bs4 import BeautifulSoup

url = 'https://mangadex.org/chapter/435396/2'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

page = requests.get(url, headers=headers)
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')
img_tags = soup.find_all('img')
for img in img_tags:
    print(img)
EDIT: As per the suggestion, the Selenium option works fine. But is there a way to make it as fast as BeautifulSoup?
The page has JavaScript that needs to run in order to populate some of the elements on the page. You could use Selenium to run the page's JavaScript before accessing the image.
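A minimal sketch of that idea, assuming Selenium 4 and a matching Chrome driver are installed (the implicit wait is a simplification; a WebDriverWait on the image element would be more robust):
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.implicitly_wait(10)  # give the page's JavaScript time to populate the DOM
driver.get('https://mangadex.org/chapter/435396/2')

# once the scripts have run, the img tags exist in the live DOM
for img in driver.find_elements(By.TAG_NAME, 'img'):
    print(img.get_attribute('src'))

driver.quit()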
You can use the site's API to get the images. The code below gets all the images from the page and prints their URLs:
import requests

headers = {
    'Accept': 'application/json, text/plain, */*',
    'Referer': 'https://mangadex.org/chapter/435396/2',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/73.0.3683.86 Safari/537.36',
}

params = (
    ('id', '435396'),
    ('type', 'chapter'),
    ('baseURL', '/api'),
)

response = requests.get('https://mangadex.org/api/', headers=headers, params=params)
data = response.json()

img_base_url = "https://s4.mangadex.org/data"
img_hash = data["hash"]
img_names = data["page_array"]

for img in img_names:
    print(f"{img_base_url}/{img_hash}/{img}")
Output:
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x1.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x2.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x3.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x4.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x5.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x6.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x7.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x8.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x9.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x10.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x11.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x12.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x13.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x14.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x15.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x16.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x17.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x18.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x19.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x20.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x21.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x22.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x23.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x24.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x25.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x26.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x27.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x28.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x29.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x30.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x31.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x32.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x33.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x34.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x35.png
https://s4.mangadex.org/data/ac081a99e13d8765d48e55869cd5444c/x36.png
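The code above only prints the URLs; a hedged follow-up to save each page locally, reusing img_base_url, img_hash, img_names and headers from the snippet (the pages folder name is an arbitrary choice):
import os
import requests

os.makedirs("pages", exist_ok=True)  # arbitrary local folder, created if missing

for img in img_names:
    img_url = f"{img_base_url}/{img_hash}/{img}"
    r = requests.get(img_url, headers=headers)
    if r.status_code == 200:  # only save successful responses
        with open(os.path.join("pages", img), "wb") as f:
            f.write(r.content)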

Download Bing image search results using Python (custom URL)

I want to download Bing search images using Python code.
Example URL: https://www.bing.com/images/search?q=sketch%2520using%20iphone%2520students
My Python code generates a Bing image search URL as shown in the example. The next step is to download all the images shown at that link to my local desktop.
In my project I generate some words in Python, and my code turns them into a Bing image search URL. All I need is to download the images shown on that search page using Python.
To download an image, you need to make a request to the actual image URL, the one that ends with .png, .jpg, etc.
Bing provides an "m" attribute inside the <a> element that stores the needed data in JSON format, from which you can parse the image URL stored in the "murl" key and download it afterward.
To download all images locally to your computer, you can use 2 methods:
# bs4
for index, url in enumerate(soup.select(".iusc"), start=1):
    img_url = json.loads(url["m"])["murl"]
    image = requests.get(img_url, headers=headers, timeout=30)
    query = query.lower().replace(" ", "_")
    if image.status_code == 200:
        with open(f"images/{query}_image_{index}.jpg", 'wb') as file:
            file.write(image.content)

# urllib
for index, url in enumerate(soup.select(".iusc"), start=1):
    img_url = json.loads(url["m"])["murl"]
    query = query.lower().replace(" ", "_")

    opener = req.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36")]
    req.install_opener(opener)
    req.urlretrieve(img_url, f"images/{query}_image_{index}.jpg")
In the first case, you can use the with open() context manager to save the image locally. In the second case, you can use the urllib.request.urlretrieve method from the urllib.request library.
Also, make sure you're sending a user-agent request header to act like a "real" user visit: the default requests user-agent is python-requests, and websites understand that such a request is most likely sent by a script. Check what your user-agent is.
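A quick way to see the user-agent a script actually sends (httpbin.org is a public echo service, used here as an assumption):
import requests

# without custom headers this prints something like
# {'user-agent': 'python-requests/2.x.x'}
print(requests.get("https://httpbin.org/user-agent").json())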
Note: An error might occur with the urllib.request.urlretrieve method when a request hits a captcha or something else that returns an unsuccessful status code. The biggest problem is that it's hard to test for the response code, whereas requests provides a status_code attribute for exactly that.
Code and full example:
from bs4 import BeautifulSoup
import requests, lxml, json

query = "sketch using iphone students"

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,
    "first": 1
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
}

response = requests.get("https://www.bing.com/images/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(response.text, "lxml")

for index, url in enumerate(soup.select(".iusc"), start=1):
    img_url = json.loads(url["m"])["murl"]
    image = requests.get(img_url, headers=headers, timeout=30)
    query = query.lower().replace(" ", "_")
    if image.status_code == 200:
        with open(f"images/{query}_image_{index}.jpg", 'wb') as file:
            file.write(image.content)
Using urllib.request.urlretrieve.
from bs4 import BeautifulSoup
import requests, lxml, json
import urllib.request as req

query = "sketch using iphone students"

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,
    "first": 1
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
}

response = requests.get("https://www.bing.com/images/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(response.text, "lxml")

for index, url in enumerate(soup.select(".iusc"), start=1):
    img_url = json.loads(url["m"])["murl"]
    query = query.lower().replace(" ", "_")

    opener = req.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36")]
    req.install_opener(opener)
    req.urlretrieve(img_url, f"images/{query}_image_{index}.jpg")
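One caveat: both examples write into an images/ folder that must already exist; a one-line sketch to create it first:
import os

os.makedirs("images", exist_ok=True)  # create the target folder if it is missing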
Edit your code to find the designated image URL, then download it with urllib.request:
import urllib.request as req
imgurl ="https://i.ytimg.com/vi/Ks-_Mh1QhMc/hqdefault.jpg"
req.urlretrieve(imgurl, "image_name.jpg")

How to log in to Amazon using BeautifulSoup

Referring to this post: Unable to log in to Amazon using Python
I tried using the suggested answer, but I still cannot log in.
I added code to display what the result is. It's inputting the email into the box, but I'm still seeing "Enter a valid email" in the result. I'm pretty sure I selected the form correctly, and the names of the input fields are correct, but I need a little guidance to debug this.
import bs4, requests
import os
import webbrowser
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
}

with requests.Session() as s:
    s.headers = headers
    r = s.get('https://www.amazon.com/ap/signin?_encoding=UTF8&ignoreAuthState=1&openid.assoc_handle=usflex&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2F%3Fref_%3Dnav_signin&switch_account=')
    soup = BeautifulSoup(r.content, "html.parser")
    signin_data = {s["name"]: s["value"]
                   for s in soup.select("form[name=signIn]")[0].select("input[name]")
                   if s.has_attr("value")}
    signin_data[u'email'] = 'xx'
    signin_data[u'password'] = 'xx'
    response = s.post('https://www.amazon.com/ap/signin', data=signin_data)
    soup = bs4.BeautifulSoup(response.text, "html.parser")

html = response.content
path = os.path.abspath('temp.html')
url = 'file://' + path
with open(path, 'w') as f:
    f.write(str(html))
webbrowser.open(url)
I don't know about BeautifulSoup, but here's how I did it using requests.
from getpass import getpass
import webbrowser
import requests
import os

amazon_username = input("Amazon email: ")  # was raw_input, which is Python 2 only
amazon_password = getpass()

headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36",
    "action": "sign-in",
    "email": amazon_username,
    "password": amazon_password
}

r = requests.get("https://www.amazon.com/gp/sign-in.html", headers=headers)
print(r.status_code)

r = requests.get("https://www.amazon.com/gp/flex/sign-in/select.html", headers=headers)
print(r.status_code)

r = requests.get("https://www.amazon.com/", headers=headers)
print(r.status_code)
