decrease string in python doesn't works

decrease string in python doesn't works - python

I have a problem to cut the url that i get as result from Beautifulsoup, i've used this code to retrieve the url.
import urllib2
from bs4 import BeautifulSoup
url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi? cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
print imgs[1:]
As result from print imgs i get [<img src="../snapPic/Snap_20160401-110642.jpg"/>]
I want to cut the unwanted characters from this string so i try to use for eg. print imgs[1:] but as result i get []
Any tips or solutions?
I want to rebuild the imgs string to the correct image url
imgs string = <img src="../snapPic/Snap_20160401-110642.jpg"/>
wanted result = http://192.168.0.184:88/snapPic/Snap_20160401-110642.jpg

try this
import urllib2
from bs4 import BeautifulSoup
url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi? cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
for img in imgs:
print img["src"].replace("..","http://192.168.0.184:88")

Related

Web Image Scraping not giving any output

I am trying to scrape all the photos in the URL below, but this code doesn't give any output, why?
import requests
from bs4 import BeautifulSoup
import os
url = 'https://www.airbnb.co.uk/s/Ljubljana--Slovenia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Ljubljana%2C%20Slovenia&place_id=ChIJ0YaYlvUxZUcRIOw_ghz4AAQ&checkin=2020-11-01&checkout=2020-11-08&source=structured_search_input_header&search_type=autocomplete_click'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.find_all('img')
print(images)
for image in images:
name = image['alt']
link = image['src']
print(name, link)

from bs4 import BeautifulSoup
import requests
# Replace this with the website's URL
URL = "put your URL here"
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
print(getURL.status_code)
soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')
resolvedURLs = []
for image in images:
src = image.get('src')
resolvedURLs.append(requests.compat.urljoin(URL, src))
for image in resolvedURLs:
webs = requests.get(image)
open('images/' + image.split('/')[-1], 'wb').write(webs.content)

Beautifulsoup findAll returns an empty list

I'm trying to scrape a webpage using beautifulsoup, but findAll() returns an empty list. This is my code:
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html5lib')
recordList = bsObj.findAll('a', attrs = {'class':"lazy-loaded "})
print(recordList)
What am I doing wrong?

you need to find img tags with a class lazyloaded
import requests
from bs4 import BeautifulSoup
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html')
recordList = bsObj.findAll('img',class_="lazy-loaded")
recordList =[i['data-src'] for i in recordList ]
print(recordList)
Output:
['https://media.elcinema.com/blank_photos/75x75.jpg', 'https://media.elcinema.com/uploads/_75x75_2fe90cb32f2759181f71eb2a9b29f0735f87ac88150a6a8fd3734300f8714369.jpg', 'https://media.elcinema.com/uploads/_75x75_3d90d1ee22c5f455bc4556073eab69cd218446d6134dc0f2694782ee39ccb5bf.jpg', 'https://media.elcinema.com/uploads/_75x75_81f30061ed82645e9ee688642275d76a23ee329344c5ac25c42f22afa35432ff.jpg', 'https://media.elcinema.com/blank_photos/75x75.jpg',.......]

According to the question, it looks like you need to find all a records who have img tag in it with a specific class lazy-loaded Follow the below code to get those:
Code:
import requests
from bs4 import BeautifulSoup
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html.parser')
outputdata=[]
recordList = bsObj.findAll('a')
for record in recordList:
if record.find("img",{"class":"lazy-loaded"}):
outputdata.append(record)
print(len(outputdata))
print(outputdata)
Output:
Let me know if you have any questions :)

how to scrape image src from homeshopping.pk?

i tried following code but he is showing progressive gif image src instead of original image shown in browser
from bs4 import BeautifulSoup
from requests import get
url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")
div = soup.find_all('div', class_='innerp')
for division in div:
image = division.find('img', class_="img-responsive")["src"]
print(image)
But this code shows following results

Instead of ['src'] attribute print data-src to get the path of the image:
from bs4 import BeautifulSoup
from requests import get
url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")
div = soup.find_all('div', class_='innerp')
for division in div:
image = division.select_one('img.img-responsive')['data-src']
print(image)
Prints:
https://cdn.homeshopping.pk/product_images/e/629/iphone-xs-max-pakistan__77506_thumb.jpg
https://cdn.homeshopping.pk/product_images/u/724/Apple_iPhone_XS_Max_%284G__256GB_Gold%29__86582_thumb.jpg
https://cdn.homeshopping.pk/product_images/q/375/1__58638_thumb.jpg
https://cdn.homeshopping.pk/product_images/m/071/iphone11-black-select-2019__99988_thumb.png
https://cdn.homeshopping.pk/product_images/c/923/note10_256gb-black_-minn__51011_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/303/iphone-11-pro-max-space-pakistan__86401_thumb.png
https://cdn.homeshopping.pk/product_images/o/722/samsung-galaxy-a71-a715-128gb-dual-sim-blue__62457_thumb.jpg
https://cdn.homeshopping.pk/product_images/y/370/71G1FCIP1EL._SL1500___25143_thumb.jpg
https://cdn.homeshopping.pk/product_images/g/906/vivo-s1-128gb-skyline-blue-4gb-ram-__95991_thumb.jpg
https://cdn.homeshopping.pk/product_images/w/375/samsung-galaxy-a31-a315-4gb-128gb-dual-sim-blue__12319_thumb.jpg
https://cdn.homeshopping.pk/product_images/s/729/Samsung-Galaxy-A51-600x600__14872_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/179/itel-A25-1-2__18679_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/940/Infinix-Hot-8-a__91944_thumb.png
https://cdn.homeshopping.pk/product_images/g/488/Realme-6-Comet-Blue-1__03862_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/364/Vivo-Y11__28500_thumb.jpg
https://cdn.homeshopping.pk/product_images/j/460/samsung-galaxy-fold-5g-2__57605_zoom__77077_thumb.jpg
https://cdn.homeshopping.pk/product_images/c/950/Apple_iPhone_7_Plus_%2832GB__Gold%29_Without_Facetime_1_Year_Official_Warranty__33040_thumb.jpg
https://cdn.homeshopping.pk/product_images/x/022/61vlByKsTSL._AC_SX679___91001_thumb.jpg
https://cdn.homeshopping.pk/product_images/e/023/images__40621_thumb.jpg
https://cdn.homeshopping.pk/product_images/i/979/s10-plus-black-hd-1__91254_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/574/135743-v1-tecno-spark-go-mobile-phone-large-1__54522_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/518/61vlByKsTSL._AC_SX679___72377_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/035/37e49fd768e1cdce2b2b0bc7a3f8baa4__11345_thumb.png
https://cdn.homeshopping.pk/product_images/b/952/InfinixSmart4-b__55934_thumb.jpg

Scraping Craiglist with BeautifulSoup and getting first image in each posting

I am currently trying to scrape aviation data from craigslist. I have no problem getting all the info I want except the first image for each post. Here is my link:
https://spokane.craigslist.org/search/avo?hasPic=1
I have been able to get all images thanks to a different post on this site but I am having trouble figuring out how to get just the first image.
I am using bs4 and requests for this script. Here is what I have so far which gets every image:
from bs4 import BeautifulSoup as bs
import requests
image_url = 'https://images.craigslist.org/{}_300x300.jpg'
r = requests.get('https://spokane.craigslist.org/search/avo?hasPic=1')
soup = bs(r.content, 'lxml')
ids = [item['data-ids'].replace('1:','') for item in soup.select('.result-image[data-ids]', limit = 10)]
images = [image_url.format(j) for i in ids for j in i.split(',')]
print(images)
Any help is greatly appreciated.
Thanks in advance,
inzel

from bs4 import BeautifulSoup
import requests
r = requests.get("https://spokane.craigslist.org/search/avo?hasPic=1")
soup = BeautifulSoup(r.text, 'html.parser')
img = "https://images.craigslist.org/"
imgs = [f"{img}{item.get('data-ids').split(':')[1].split(',')[0]}_300x300.jpg"
for item in soup.findAll("a", class_="result-image gallery")]
print(imgs)
output:
['https://images.craigslist.org/00N0N_ci3cbcv5T58_300x300.jpg', 'https://images.craigslist.org/00101_5dLpBXXdDWJ_300x300.jpg', 'https://images.craigslist.org/00n0n_8zVXHONPkTH_300x300.jpg', 'https://images.craigslist.org/00l0l_jiNMe38avtl_300x300.jpg', 'https://images.craigslist.org/00q0q_l4hts9RPOuk_300x300.jpg', 'https://images.craigslist.org/00D0D_ibbWWn7uFCu_300x300.jpg', 'https://images.craigslist.org/00z0z_2ylVbmdVnPr_300x300.jpg', 'https://images.craigslist.org/00Q0Q_ha0o2IJwj4Q_300x300.jpg', 'https://images.craigslist.org/01212_5LoZU43xA7r_300x300.jpg', 'https://images.craigslist.org/00U0U_7CMAu8vAhDi_300x300.jpg', 'https://images.craigslist.org/00m0m_8c7azYhDR1Z_300x300.jpg', 'https://images.craigslist.org/00E0E_7k7cPL7zNnP_300x300.jpg', 'https://images.craigslist.org/00I0I_97AZy8UMt5V_300x300.jpg', 'https://images.craigslist.org/00G0G_iWw8AI8N8Kf_300x300.jpg', 'https://images.craigslist.org/00m0m_9BEEcvD0681_300x300.jpg', 'https://images.craigslist.org/01717_4Ut5FSIdoi3_300x300.jpg', 'https://images.craigslist.org/00h0h_jeAhtDXW2ST_300x300.jpg', 'https://images.craigslist.org/00T0T_hTogH4m9zTH_300x300.jpg', 'https://images.craigslist.org/01212_9x1EFI1CYHE_300x300.jpg', 'https://images.craigslist.org/00H0H_kiXLOtVgReA_300x300.jpg', 'https://images.craigslist.org/00P0P_ad77Eqvf1ul_300x300.jpg', 'https://images.craigslist.org/00909_jyBoTCNGmAJ_300x300.jpg', 'https://images.craigslist.org/00g0g_gFtJlANhi51_300x300.jpg', 'https://images.craigslist.org/00202_3LV7YERBssE_300x300.jpg', 'https://images.craigslist.org/00j0j_3zxT682nE2i_300x300.jpg', 'https://images.craigslist.org/00Y0Y_b6AXcApcSfl_300x300.jpg', 'https://images.craigslist.org/00M0M_6eTHo5E3Ee5_300x300.jpg', 'https://images.craigslist.org/00g0g_hvyvJKUejXY_300x300.jpg', 'https://images.craigslist.org/00I0I_d2WOWXtgQ8s_300x300.jpg', 'https://images.craigslist.org/00s0s_dAwJG0D6uce_300x300.jpg', 'https://images.craigslist.org/00g0g_TC2qvnD3AN_300x300.jpg', 'https://images.craigslist.org/00M0M_Dba39RfEkr_300x300.jpg', 'https://images.craigslist.org/00M0M_31drxF6c9vO_300x300.jpg', 'https://images.craigslist.org/00505_jOjMq3B8y0M_300x300.jpg', 'https://images.craigslist.org/00e0e_ixfV647qwLh_300x300.jpg', 'https://images.craigslist.org/00p0p_i2noTC4cADw_300x300.jpg', 'https://images.craigslist.org/00a0a_kywatxfm6Ud_300x300.jpg', 'https://images.craigslist.org/00808_1ZjIIX8PdaP_300x300.jpg', 'https://images.craigslist.org/01515_blEEDKbbyKD_300x300.jpg', 'https://images.craigslist.org/00b0b_brUn6sUxBzF_300x300.jpg', 'https://images.craigslist.org/00U0U_2ukBvcgvU99_300x300.jpg', 'https://images.craigslist.org/01212_dPTe5ZHM26A_300x300.jpg', 'https://images.craigslist.org/00B0B_1GsE81zVsr0_300x300.jpg', 'https://images.craigslist.org/00N0N_l8SXlBaI8lq_300x300.jpg', 'https://images.craigslist.org/00f0f_82qAzPq7cXd_300x300.jpg', 'https://images.craigslist.org/00w0w_lUrgFG9YOY0_300x300.jpg', 'https://images.craigslist.org/00C0C_kiZpgrFEnO8_300x300.jpg', 'https://images.craigslist.org/00T0T_g7IHvHMx14L_300x300.jpg', 'https://images.craigslist.org/00E0E_bzm9jRXpWVd_300x300.jpg', 'https://images.craigslist.org/00k0k_lOCRF1fgWCF_300x300.jpg', 'https://images.craigslist.org/00y0y_exwReppAi3L_300x300.jpg', 'https://images.craigslist.org/01515_7xyZ605hYcc_300x300.jpg', 'https://images.craigslist.org/00J0J_hqLMLvTCfXk_300x300.jpg', 'https://images.craigslist.org/00505_3P0xQrbeFY4_300x300.jpg', 'https://images.craigslist.org/00r0r_gj6dO6ZHO8L_300x300.jpg', 'https://images.craigslist.org/01717_cIVmzgKCWtP_300x300.jpg', 'https://images.craigslist.org/00w0w_6O59k6qlZQz_300x300.jpg', 'https://images.craigslist.org/00808_jd43ZthN1uB_300x300.jpg', 'https://images.craigslist.org/00m0m_1GJ41cKvv4Y_300x300.jpg']
That list is containing the first image for each post.

You need to find all class with the gallery of images then get the data-ids.
Then split them into a list and get the first element [0].
from bs4 import BeautifulSoup as bs
import requests
image_url = 'https://images.craigslist.org/{}_300x300.jpg'
r = requests.get('https://spokane.craigslist.org/search/avo?hasPic=1')
soup = bs(r.content, 'lxml')
ids = [item.get('data-ids').replace('1:','') for item in soup.findAll("a", {"class": "result-image gallery"}, limit=10)]
images = [image_url.format(i.split(',')[0]) for i in ids]
print(images)
Result:
['https://images.craigslist.org/00N0N_ci3cbcv5T58_300x300.jpg', 'https://images.craigslist.org/00101_5dLpBXXdDWJ_300x300.jpg', 'https://images.craigslist.org/00n0n_8zVXHONPkTH_300x300.jpg', 'https://images.craigslist.org/00l0l_jiNMe38avtl_300x300.jpg', 'https://images.craigslist.org/01212_fULyvfO9Rqz_300x300.jpg', 'https://images.craigslist.org/00D0D_ibbWWn7uFCu_300x300.jpg', 'https://images.craigslist.org/00z0z_2ylVbmdVnPr_300x300.jpg', 'https://images.craigslist.org/00Q0Q_ha0o2IJwj4Q_300x300.jpg', 'https://images.craigslist.org/01212_5LoZU43xA7r_300x300.jpg', 'https://images.craigslist.org/00U0U_7CMAu8vAhDi_300x300.jpg']

Here is a clean and straightforward solution:
from pprint import pprint
import requests
from bs4 import BeautifulSoup
base_image_url = 'https://images.craigslist.org/{}_300x300.jpg'
r = requests.get('https://spokane.craigslist.org/search/avo?hasPic=1')
soup = BeautifulSoup(r.content, 'lxml')
results = []
for elem in soup.find_all("a", attrs={"class": "result-image gallery"})[:2]:
listing_url = elem.get("href")
image_urls = []
image_ids = elem.get("data-ids")
if image_ids:
image_urls = [base_image_url.format(curr_id[2:]) for curr_id in image_ids.split(",")]
results.append((listing_url, image_urls))
pprint(results)
Output:
[('https://spokane.craigslist.org/avo/d/spokane-lightspeed-sierra-headset/7090771925.html',
['https://images.craigslist.org/00N0N_ci3cbcv5T58_300x300.jpg',
'https://images.craigslist.org/00q0q_5ax4n1nCwmI_300x300.jpg',
'https://images.craigslist.org/00202_pAcLlJsaR3_300x300.jpg',
'https://images.craigslist.org/00z0z_kCZGUL6WZZw_300x300.jpg',
'https://images.craigslist.org/00G0G_8A2Xg7Wbe7B_300x300.jpg',
'https://images.craigslist.org/00f0f_cNpP8ZfUXdU_300x300.jpg']),
('https://spokane.craigslist.org/avo/d/spokane-window-mounted-air-conditioner/7090361383.html',
['https://images.craigslist.org/00101_5dLpBXXdDWJ_300x300.jpg',
'https://images.craigslist.org/00I0I_lxNKJsQAT7X_300x300.jpg',
'https://images.craigslist.org/00t0t_3BeBsNO6xH6_300x300.jpg',
'https://images.craigslist.org/00L0L_aPnbejSiXQp_300x300.jpg'])]
Let me know if you have any questions :)

Could not get link from html content using python

Here is the URL that I'am using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
In fact on this page, the link that I am looking for appears may be 5 second after loading the page.
I see after 5 second a post request to :
http://www.protect-stream.com/secur.php
with data like so :
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I didn't get from where the 'k' value come from ?
Is their an idea on how we could get the 'k' value using python ?

This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
base_url = "http://www.protect-stream.com"
with requests.Session() as session:
response = session.get("http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")
# get the top frame url
soup = BeautifulSoup(response.content, "html.parser")
src = soup.select_one('iframe[src^="frame.php"]')["src"]
frame_url = urljoin(base_url, src)
# get the nested frame url
response = session.get(frame_url)
soup = BeautifulSoup(response.content, "html.parser")
src = soup.select_one('iframe[src^="w.php"]')["src"]
frame_url = urljoin(base_url, src)
# get the frame HTML source and extract the "k" value
response = session.get(frame_url)
soup = BeautifulSoup(response.content, "html.parser")
script = soup.find("script", text=lambda text: text and "k=" in text).get_text(strip=True)
k_value = re.search(r'var k="(.*?)";', script).group(1)
print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

decrease string in python doesn't works - python

Related

Web Image Scraping not giving any output

Beautifulsoup findAll returns an empty list

how to scrape image src from homeshopping.pk?

Scraping Craiglist with BeautifulSoup and getting first image in each posting

Could not get link from html content using python

Categories

Resources