How to scrape image src from homeshopping.pk? - Python

I tried the following code, but it shows the progressive GIF placeholder src instead of the original image shown in the browser:
from bs4 import BeautifulSoup
from requests import get

url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")

div = soup.find_all('div', class_='innerp')
for division in div:
    image = division.find('img', class_="img-responsive")["src"]
    print(image)
But this code only shows the lazy-load placeholder paths, not the image URLs visible in the browser.

Instead of the ['src'] attribute, print ['data-src'] to get the real path of the image. The page lazy-loads its images, so src only holds the placeholder GIF while data-src carries the actual URL:
from bs4 import BeautifulSoup
from requests import get

url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")

div = soup.find_all('div', class_='innerp')
for division in div:
    image = division.select_one('img.img-responsive')['data-src']
    print(image)
Prints:
https://cdn.homeshopping.pk/product_images/e/629/iphone-xs-max-pakistan__77506_thumb.jpg
https://cdn.homeshopping.pk/product_images/u/724/Apple_iPhone_XS_Max_%284G__256GB_Gold%29__86582_thumb.jpg
https://cdn.homeshopping.pk/product_images/q/375/1__58638_thumb.jpg
https://cdn.homeshopping.pk/product_images/m/071/iphone11-black-select-2019__99988_thumb.png
https://cdn.homeshopping.pk/product_images/c/923/note10_256gb-black_-minn__51011_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/303/iphone-11-pro-max-space-pakistan__86401_thumb.png
https://cdn.homeshopping.pk/product_images/o/722/samsung-galaxy-a71-a715-128gb-dual-sim-blue__62457_thumb.jpg
https://cdn.homeshopping.pk/product_images/y/370/71G1FCIP1EL._SL1500___25143_thumb.jpg
https://cdn.homeshopping.pk/product_images/g/906/vivo-s1-128gb-skyline-blue-4gb-ram-__95991_thumb.jpg
https://cdn.homeshopping.pk/product_images/w/375/samsung-galaxy-a31-a315-4gb-128gb-dual-sim-blue__12319_thumb.jpg
https://cdn.homeshopping.pk/product_images/s/729/Samsung-Galaxy-A51-600x600__14872_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/179/itel-A25-1-2__18679_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/940/Infinix-Hot-8-a__91944_thumb.png
https://cdn.homeshopping.pk/product_images/g/488/Realme-6-Comet-Blue-1__03862_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/364/Vivo-Y11__28500_thumb.jpg
https://cdn.homeshopping.pk/product_images/j/460/samsung-galaxy-fold-5g-2__57605_zoom__77077_thumb.jpg
https://cdn.homeshopping.pk/product_images/c/950/Apple_iPhone_7_Plus_%2832GB__Gold%29_Without_Facetime_1_Year_Official_Warranty__33040_thumb.jpg
https://cdn.homeshopping.pk/product_images/x/022/61vlByKsTSL._AC_SX679___91001_thumb.jpg
https://cdn.homeshopping.pk/product_images/e/023/images__40621_thumb.jpg
https://cdn.homeshopping.pk/product_images/i/979/s10-plus-black-hd-1__91254_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/574/135743-v1-tecno-spark-go-mobile-phone-large-1__54522_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/518/61vlByKsTSL._AC_SX679___72377_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/035/37e49fd768e1cdce2b2b0bc7a3f8baa4__11345_thumb.png
https://cdn.homeshopping.pk/product_images/b/952/InfinixSmart4-b__55934_thumb.jpg
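If you want one loop that keeps working whether or not a given item has been lazy-loaded, a small fallback helps. This is only a sketch building on the answer above, assuming the same innerp / img-responsive markup; it prefers data-src and falls back to src:
from bs4 import BeautifulSoup
from requests import get

url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
soup = BeautifulSoup(get(url).text, features="lxml")

for img in soup.select('div.innerp img.img-responsive'):
    # Prefer the lazy-load attribute; fall back to plain src (the placeholder GIF)
    # if data-src is missing on a particular tag.
    print(img.get('data-src') or img.get('src'))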

Related

Web Image Scraping not giving any output

I am trying to scrape all the photos at the URL below, but this code doesn't give any output. Why?
import requests
from bs4 import BeautifulSoup
import os

url = 'https://www.airbnb.co.uk/s/Ljubljana--Slovenia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Ljubljana%2C%20Slovenia&place_id=ChIJ0YaYlvUxZUcRIOw_ghz4AAQ&checkin=2020-11-01&checkout=2020-11-08&source=structured_search_input_header&search_type=autocomplete_click'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.find_all('img')
print(images)
for image in images:
    name = image['alt']
    link = image['src']
    print(name, link)
Send a browser User-Agent header, resolve each relative src against the page URL, and then download every image:
from bs4 import BeautifulSoup
import requests
import os

# Replace this with the website's URL
URL = "put your URL here"
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
print(getURL.status_code)

soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')

# Resolve relative src values against the page URL
resolvedURLs = []
for image in images:
    src = image.get('src')
    resolvedURLs.append(requests.compat.urljoin(URL, src))

# The target directory must exist before files can be written into it
os.makedirs('images', exist_ok=True)
for image in resolvedURLs:
    webs = requests.get(image)
    open('images/' + image.split('/')[-1], 'wb').write(webs.content)
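One caveat the answer above does not handle (this guard is my assumption, not part of the original code): some img tags have no src at all, or use inline data: URIs, and both would produce broken download URLs. A sketch of the same resolution step with a filter:
import requests
from bs4 import BeautifulSoup

URL = "put your URL here"  # same placeholder as in the answer above
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')

resolvedURLs = []
for image in soup.find_all('img'):
    src = image.get('src')
    # Skip tags with no src attribute and inline data: URIs;
    # neither can be downloaded as a regular file.
    if not src or src.startswith('data:'):
        continue
    resolvedURLs.append(requests.compat.urljoin(URL, src))
print(resolvedURLs)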

Retrieving content using Beautifulsoup and selectors

I'm trying to retrieve content (text) embedded in the HTML, but I'm not getting the content.
I'm trying to use a selector of this form to find price_box:
price_box = soup2.find('div', attrs={'title class': 'Fw(600)'})
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Set the URL you want to webscrape from
url = 'https://finance.yahoo.com/quote/NVDA?p=NVDA'
# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
beta = soup.find('h1')
#print (beta)
link = beta.contents
variable = 'NVDA - NVIDIA Corporation'
test = 'NVDA - NVIDIA Corporation'
#<..>
url2 = 'https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01'
response2 = requests.get(url2)
soup2 = BeautifulSoup(response2.text, "html.parser")
# alpha = soup2.find('')
# div = soup.find('a', {class_ ='D(ib) '})
# text = div.string
price_box = soup2.find('div', attrs={'title class': 'Fw(600)'})
#price = price_box.text
print("Price Box: "+ str(price_box)) # THIS IS WHAT I WANT
I was hoping to see "Senea". Instead I'm seeing "None": "Price Box: None".
A lot of the content is dynamic. You can regex that info out easily:
import requests, re
p = re.compile(r'"YFINANCE:(.*?)"')
r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
print(p.findall(r.text)[0])
An alternative is to avoid the dynamic-looking classes altogether:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
soup = bs(r.content, 'lxml')
print(soup.select_one('#cal-res-table a').text)
Reading: CSS selectors
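If you want every link in that earnings table rather than just the first one, the same idea extends from select_one to select (a sketch assuming the #cal-res-table markup used above is still what the page serves):
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
soup = bs(r.content, 'lxml')

# select() returns every match instead of only the first
for a in soup.select('#cal-res-table a'):
    print(a.text)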

Could not get link from html content using python

Here is the URL that I'm using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
On this page, the link that I am looking for appears maybe 5 seconds after the page loads.
After those 5 seconds I see a POST request to:
http://www.protect-stream.com/secur.php
with data like so:
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I don't understand where the 'k' value comes from.
Is there a way to get the 'k' value using Python?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "http://www.protect-stream.com"

with requests.Session() as session:
    response = session.get("http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")

    # get the top frame url
    soup = BeautifulSoup(response.content, "html.parser")
    src = soup.select_one('iframe[src^="frame.php"]')["src"]
    frame_url = urljoin(base_url, src)

    # get the nested frame url
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    src = soup.select_one('iframe[src^="w.php"]')["src"]
    frame_url = urljoin(base_url, src)

    # get the frame HTML source and extract the "k" value
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    script = soup.find("script", text=lambda text: text and "k=" in text).get_text(strip=True)
    k_value = re.search(r'var k="(.*?)";', script).group(1)
    print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,
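The question also notes that the page then POSTs this value to secur.php. With k_value extracted, that request could in principle be replayed from inside the same with requests.Session() block shown above. This continuation is only a sketch: the endpoint and the k parameter come from the question, and the server's expected payload and response are unknown.
    # still inside the "with requests.Session() as session:" block
    response = session.post(urljoin(base_url, "secur.php"), data={"k": k_value})
    print(response.status_code)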

Cutting down a string in Python doesn't work

I have a problem cutting the URL that I get as a result from BeautifulSoup. I've used this code to retrieve the URL:
import urllib2
from bs4 import BeautifulSoup

url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi?cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
print imgs[1:]
As the result of print imgs I get [<img src="../snapPic/Snap_20160401-110642.jpg"/>].
I want to cut the unwanted characters from this string, so I tried, for example, print imgs[1:], but the result is [] (imgs is a list with a single element, so slicing it drops that element instead of trimming the string).
Any tips or solutions?
I want to rebuild the imgs string into the correct image URL:
imgs string = <img src="../snapPic/Snap_20160401-110642.jpg"/>
wanted result = http://192.168.0.184:88/snapPic/Snap_20160401-110642.jpg
Try this:
import urllib2
from bs4 import BeautifulSoup

url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi?cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
for img in imgs:
    print img["src"].replace("..", "http://192.168.0.184:88")

How should I scrape these images without errors?

I'm trying to scrape the images (or the image links) from this forum (http://www.xossip.com/showthread.php?t=1384077). I've tried Beautiful Soup 4, and here is the code I tried:
import requests
from bs4 import BeautifulSoup

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.xossip.com/showthread.php?t=1384077&page=' + str(page)
        sourcecode = requests.get(url)
        plaintext = sourcecode.text
        soup = BeautifulSoup(plaintext)
        for link in soup.findAll('a', {'class': 'alt1'}):
            src = link.get('src')
            print(src)
        page += 1

spider(1)
How should I correct it so that I get image links like pzy.be/example?
Okay, so I did this by getting all of the #post_message_* divs and then getting the images from each of those.
import requests
from bs4 import BeautifulSoup

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.xossip.com/showthread.php?t=1384077&page=' + str(page)
        sourcecode = requests.get(url)
        plaintext = sourcecode.text
        soup = BeautifulSoup(plaintext)
        divs = soup.findAll('div', id=lambda d: d and d.startswith('post_message_'))
        for div in divs:
            src = div.find('img')['src']
            if src.startswith('http'):  # b/c it could be a smilie or something like that
                print(src)
        page += 1

spider(1)
The simplest way is to just request each page and filter the img tags:
from bs4 import BeautifulSoup
from requests import get
import re

def get_wp():
    start_url = "http://www.xossip.com/showthread.php?t=1384077&page={}"
    for i in range(73):
        r = get(start_url.format(i))
        soup = BeautifulSoup(r.content)
        for img in (i["src"] for i in soup.find_all("img", src=re.compile("http://pzy.be.*.jpg"))):
            yield img
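get_wp() is a generator, so nothing is fetched until you iterate over it; a minimal usage example:
for img in get_wp():
    print(img)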
