How to scrape image src from homeshopping.pk? - Python

I tried the following code, but it shows the progressive GIF placeholder src instead of the original image shown in the browser:
from bs4 import BeautifulSoup
from requests import get

url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")

div = soup.find_all('div', class_='innerp')
for division in div:
    image = division.find('img', class_="img-responsive")["src"]
    print(image)
But this code only shows the lazy-load placeholder paths, not the image URLs visible in the browser.

Instead of the ['src'] attribute, print ['data-src'] to get the real path of the image. The page lazy-loads its images, so src only holds the placeholder GIF while data-src carries the actual URL:
from bs4 import BeautifulSoup
from requests import get

url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")

div = soup.find_all('div', class_='innerp')
for division in div:
    image = division.select_one('img.img-responsive')['data-src']
    print(image)
Prints:
https://cdn.homeshopping.pk/product_images/e/629/iphone-xs-max-pakistan__77506_thumb.jpg
https://cdn.homeshopping.pk/product_images/u/724/Apple_iPhone_XS_Max_%284G__256GB_Gold%29__86582_thumb.jpg
https://cdn.homeshopping.pk/product_images/q/375/1__58638_thumb.jpg
https://cdn.homeshopping.pk/product_images/m/071/iphone11-black-select-2019__99988_thumb.png
https://cdn.homeshopping.pk/product_images/c/923/note10_256gb-black_-minn__51011_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/303/iphone-11-pro-max-space-pakistan__86401_thumb.png
https://cdn.homeshopping.pk/product_images/o/722/samsung-galaxy-a71-a715-128gb-dual-sim-blue__62457_thumb.jpg
https://cdn.homeshopping.pk/product_images/y/370/71G1FCIP1EL._SL1500___25143_thumb.jpg
https://cdn.homeshopping.pk/product_images/g/906/vivo-s1-128gb-skyline-blue-4gb-ram-__95991_thumb.jpg
https://cdn.homeshopping.pk/product_images/w/375/samsung-galaxy-a31-a315-4gb-128gb-dual-sim-blue__12319_thumb.jpg
https://cdn.homeshopping.pk/product_images/s/729/Samsung-Galaxy-A51-600x600__14872_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/179/itel-A25-1-2__18679_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/940/Infinix-Hot-8-a__91944_thumb.png
https://cdn.homeshopping.pk/product_images/g/488/Realme-6-Comet-Blue-1__03862_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/364/Vivo-Y11__28500_thumb.jpg
https://cdn.homeshopping.pk/product_images/j/460/samsung-galaxy-fold-5g-2__57605_zoom__77077_thumb.jpg
https://cdn.homeshopping.pk/product_images/c/950/Apple_iPhone_7_Plus_%2832GB__Gold%29_Without_Facetime_1_Year_Official_Warranty__33040_thumb.jpg
https://cdn.homeshopping.pk/product_images/x/022/61vlByKsTSL._AC_SX679___91001_thumb.jpg
https://cdn.homeshopping.pk/product_images/e/023/images__40621_thumb.jpg
https://cdn.homeshopping.pk/product_images/i/979/s10-plus-black-hd-1__91254_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/574/135743-v1-tecno-spark-go-mobile-phone-large-1__54522_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/518/61vlByKsTSL._AC_SX679___72377_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/035/37e49fd768e1cdce2b2b0bc7a3f8baa4__11345_thumb.png
https://cdn.homeshopping.pk/product_images/b/952/InfinixSmart4-b__55934_thumb.jpg
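If you want one loop that keeps working whether or not a given item has been lazy-loaded, a small fallback helps. This is only a sketch building on the answer above, assuming the same innerp / img-responsive markup; it prefers data-src and falls back to src:
from bs4 import BeautifulSoup
from requests import get

url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
soup = BeautifulSoup(get(url).text, features="lxml")

for img in soup.select('div.innerp img.img-responsive'):
    # Prefer the lazy-load attribute; fall back to plain src (the placeholder GIF)
    # if data-src is missing on a particular tag.
    print(img.get('data-src') or img.get('src'))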

Related

Web Image Scraping not giving any output

I am trying to scrape all the photos at the URL below, but this code doesn't give any output. Why?
import requests
from bs4 import BeautifulSoup
import os

url = 'https://www.airbnb.co.uk/s/Ljubljana--Slovenia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Ljubljana%2C%20Slovenia&place_id=ChIJ0YaYlvUxZUcRIOw_ghz4AAQ&checkin=2020-11-01&checkout=2020-11-08&source=structured_search_input_header&search_type=autocomplete_click'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.find_all('img')
print(images)
for image in images:
    name = image['alt']
    link = image['src']
    print(name, link)
Send a browser User-Agent header, resolve each relative src against the page URL, and then download every image:
from bs4 import BeautifulSoup
import requests
import os

# Replace this with the website's URL
URL = "put your URL here"
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
print(getURL.status_code)

soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')

# Resolve relative src values against the page URL
resolvedURLs = []
for image in images:
    src = image.get('src')
    resolvedURLs.append(requests.compat.urljoin(URL, src))

# The target directory must exist before files can be written into it
os.makedirs('images', exist_ok=True)
for image in resolvedURLs:
    webs = requests.get(image)
    open('images/' + image.split('/')[-1], 'wb').write(webs.content)
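One caveat the answer above does not handle (this guard is my assumption, not part of the original code): some img tags have no src at all, or use inline data: URIs, and both would produce broken download URLs. A sketch of the same resolution step with a filter:
import requests
from bs4 import BeautifulSoup

URL = "put your URL here"  # same placeholder as in the answer above
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')

resolvedURLs = []
for image in soup.find_all('img'):
    src = image.get('src')
    # Skip tags with no src attribute and inline data: URIs;
    # neither can be downloaded as a regular file.
    if not src or src.startswith('data:'):
        continue
    resolvedURLs.append(requests.compat.urljoin(URL, src))
print(resolvedURLs)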

Retrieving content using Beautifulsoup and selectors

I'm trying to retrieve content (text) embedded in the HTML, but I'm not getting the content.
I'm trying to use a selector of this form to find price_box:
price_box = soup2.find('div', attrs={'title class': 'Fw(600)'})
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Set the URL you want to webscrape from
url = 'https://finance.yahoo.com/quote/NVDA?p=NVDA'
# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
beta = soup.find('h1')
#print (beta)
link = beta.contents
variable = 'NVDA - NVIDIA Corporation'
test = 'NVDA - NVIDIA Corporation'
#<..>
url2 = 'https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01'
response2 = requests.get(url2)
soup2 = BeautifulSoup(response2.text, "html.parser")
# alpha = soup2.find('')
# div = soup.find('a', {class_ ='D(ib) '})
# text = div.string
price_box = soup2.find('div', attrs={'title class': 'Fw(600)'})
#price = price_box.text
print("Price Box: "+ str(price_box)) # THIS IS WHAT I WANT
I was hoping to see "Senea". Instead I'm seeing "None": "Price Box: None".
A lot of the content is dynamic. You can regex that info out easily:
import requests, re
p = re.compile(r'"YFINANCE:(.*?)"')
r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
print(p.findall(r.text)[0])
An alternative is to avoid the dynamic-looking classes altogether:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
soup = bs(r.content, 'lxml')
print(soup.select_one('#cal-res-table a').text)
Reading: CSS selectors
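If you want every link in that earnings table rather than just the first one, the same idea extends from select_one to select (a sketch assuming the #cal-res-table markup used above is still what the page serves):
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
soup = bs(r.content, 'lxml')

# select() returns every match instead of only the first
for a in soup.select('#cal-res-table a'):
    print(a.text)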

Could not get link from html content using python

Here is the URL that I'm using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
On this page, the link that I am looking for appears maybe 5 seconds after the page loads.
After those 5 seconds I see a POST request to:
http://www.protect-stream.com/secur.php
with data like so:
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I don't understand where the 'k' value comes from.
Is there a way to get the 'k' value using Python?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "http://www.protect-stream.com"

with requests.Session() as session:
    response = session.get("http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")

    # get the top frame url
    soup = BeautifulSoup(response.content, "html.parser")
    src = soup.select_one('iframe[src^="frame.php"]')["src"]
    frame_url = urljoin(base_url, src)

    # get the nested frame url
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    src = soup.select_one('iframe[src^="w.php"]')["src"]
    frame_url = urljoin(base_url, src)

    # get the frame HTML source and extract the "k" value
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    script = soup.find("script", text=lambda text: text and "k=" in text).get_text(strip=True)
    k_value = re.search(r'var k="(.*?)";', script).group(1)
    print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,
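The question also notes that the page then POSTs this value to secur.php. With k_value extracted, that request could in principle be replayed from inside the same with requests.Session() block shown above. This continuation is only a sketch: the endpoint and the k parameter come from the question, and the server's expected payload and response are unknown.
    # still inside the "with requests.Session() as session:" block
    response = session.post(urljoin(base_url, "secur.php"), data={"k": k_value})
    print(response.status_code)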

Cutting down a string in Python doesn't work

I have a problem cutting the URL that I get as a result from BeautifulSoup. I've used this code to retrieve the URL:
import urllib2
from bs4 import BeautifulSoup

url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi?cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
print imgs[1:]
As the result of print imgs I get [<img src="../snapPic/Snap_20160401-110642.jpg"/>].
I want to cut the unwanted characters from this string, so I tried, for example, print imgs[1:], but the result is [] (imgs is a list with a single element, so slicing it drops that element instead of trimming the string).
Any tips or solutions?
I want to rebuild the imgs string into the correct image URL:
imgs string = <img src="../snapPic/Snap_20160401-110642.jpg"/>
wanted result = http://192.168.0.184:88/snapPic/Snap_20160401-110642.jpg
Try this:
import urllib2
from bs4 import BeautifulSoup

url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi?cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
for img in imgs:
    print img["src"].replace("..", "http://192.168.0.184:88")

How should I scrape these images without errors?

I'm trying to scrape the images (or the image links) from this forum (http://www.xossip.com/showthread.php?t=1384077). I've tried Beautiful Soup 4, and here is the code I tried:
import requests
from bs4 import BeautifulSoup

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.xossip.com/showthread.php?t=1384077&page=' + str(page)
        sourcecode = requests.get(url)
        plaintext = sourcecode.text
        soup = BeautifulSoup(plaintext)
        for link in soup.findAll('a', {'class': 'alt1'}):
            src = link.get('src')
            print(src)
        page += 1

spider(1)
How should I correct it so that I get image links like pzy.be/example?
Okay, so I did this by getting all of the #post_message_* divs and then getting the images from each of those.
import requests
from bs4 import BeautifulSoup

def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.xossip.com/showthread.php?t=1384077&page=' + str(page)
        sourcecode = requests.get(url)
        plaintext = sourcecode.text
        soup = BeautifulSoup(plaintext)
        divs = soup.findAll('div', id=lambda d: d and d.startswith('post_message_'))
        for div in divs:
            src = div.find('img')['src']
            if src.startswith('http'):  # b/c it could be a smilie or something like that
                print(src)
        page += 1

spider(1)
The simplest way is to just request each page and filter the img tags:
from bs4 import BeautifulSoup
from requests import get
import re

def get_wp():
    start_url = "http://www.xossip.com/showthread.php?t=1384077&page={}"
    for i in range(73):
        r = get(start_url.format(i))
        soup = BeautifulSoup(r.content)
        for img in (i["src"] for i in soup.find_all("img", src=re.compile("http://pzy.be.*.jpg"))):
            yield img
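get_wp() is a generator, so nothing is fetched until you iterate over it; a minimal usage example:
for img in get_wp():
    print(img)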
