I am trying to scrape all the photos in the URL below, but this code doesn't give any output, why?
import requests
from bs4 import BeautifulSoup
import os
url = 'https://www.airbnb.co.uk/s/Ljubljana--Slovenia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Ljubljana%2C%20Slovenia&place_id=ChIJ0YaYlvUxZUcRIOw_ghz4AAQ&checkin=2020-11-01&checkout=2020-11-08&source=structured_search_input_header&search_type=autocomplete_click'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.find_all('img')
print(images)
for image in images:
name = image['alt']
link = image['src']
print(name, link)
from bs4 import BeautifulSoup
import requests
# Replace this with the website's URL
URL = "put your URL here"
getURL = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
print(getURL.status_code)
soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')
resolvedURLs = []
for image in images:
src = image.get('src')
resolvedURLs.append(requests.compat.urljoin(URL, src))
for image in resolvedURLs:
webs = requests.get(image)
open('images/' + image.split('/')[-1], 'wb').write(webs.content)
I'm trying to scrape a webpage using beautifulsoup, but findAll() returns an empty list. This is my code:
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html5lib')
recordList = bsObj.findAll('a', attrs = {'class':"lazy-loaded "})
print(recordList)
What am I doing wrong?
you need to find img tags with a class lazyloaded
import requests
from bs4 import BeautifulSoup
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html')
recordList = bsObj.findAll('img',class_="lazy-loaded")
recordList =[i['data-src'] for i in recordList ]
print(recordList)
Output:
['https://media.elcinema.com/blank_photos/75x75.jpg', 'https://media.elcinema.com/uploads/_75x75_2fe90cb32f2759181f71eb2a9b29f0735f87ac88150a6a8fd3734300f8714369.jpg', 'https://media.elcinema.com/uploads/_75x75_3d90d1ee22c5f455bc4556073eab69cd218446d6134dc0f2694782ee39ccb5bf.jpg', 'https://media.elcinema.com/uploads/_75x75_81f30061ed82645e9ee688642275d76a23ee329344c5ac25c42f22afa35432ff.jpg', 'https://media.elcinema.com/blank_photos/75x75.jpg',.......]
According to the question, it looks like you need to find all a records who have img tag in it with a specific class lazy-loaded Follow the below code to get those:
Code:
import requests
from bs4 import BeautifulSoup
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html.parser')
outputdata=[]
recordList = bsObj.findAll('a')
for record in recordList:
if record.find("img",{"class":"lazy-loaded"}):
outputdata.append(record)
print(len(outputdata))
print(outputdata)
Output:
Let me know if you have any questions :)
i tried following code but he is showing progressive gif image src instead of original image shown in browser
from bs4 import BeautifulSoup
from requests import get
url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")
div = soup.find_all('div', class_='innerp')
for division in div:
image = division.find('img', class_="img-responsive")["src"]
print(image)
But this code shows following results
Instead of ['src'] attribute print data-src to get the path of the image:
from bs4 import BeautifulSoup
from requests import get
url = "https://homeshopping.pk/categories/Mobile-Phones-Price-Pakistan?page=1&AjaxRequest=1"
response = get(url)
soup = BeautifulSoup(response.text, features="lxml")
div = soup.find_all('div', class_='innerp')
for division in div:
image = division.select_one('img.img-responsive')['data-src']
print(image)
Prints:
https://cdn.homeshopping.pk/product_images/e/629/iphone-xs-max-pakistan__77506_thumb.jpg
https://cdn.homeshopping.pk/product_images/u/724/Apple_iPhone_XS_Max_%284G__256GB_Gold%29__86582_thumb.jpg
https://cdn.homeshopping.pk/product_images/q/375/1__58638_thumb.jpg
https://cdn.homeshopping.pk/product_images/m/071/iphone11-black-select-2019__99988_thumb.png
https://cdn.homeshopping.pk/product_images/c/923/note10_256gb-black_-minn__51011_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/303/iphone-11-pro-max-space-pakistan__86401_thumb.png
https://cdn.homeshopping.pk/product_images/o/722/samsung-galaxy-a71-a715-128gb-dual-sim-blue__62457_thumb.jpg
https://cdn.homeshopping.pk/product_images/y/370/71G1FCIP1EL._SL1500___25143_thumb.jpg
https://cdn.homeshopping.pk/product_images/g/906/vivo-s1-128gb-skyline-blue-4gb-ram-__95991_thumb.jpg
https://cdn.homeshopping.pk/product_images/w/375/samsung-galaxy-a31-a315-4gb-128gb-dual-sim-blue__12319_thumb.jpg
https://cdn.homeshopping.pk/product_images/s/729/Samsung-Galaxy-A51-600x600__14872_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/179/itel-A25-1-2__18679_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/940/Infinix-Hot-8-a__91944_thumb.png
https://cdn.homeshopping.pk/product_images/g/488/Realme-6-Comet-Blue-1__03862_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/364/Vivo-Y11__28500_thumb.jpg
https://cdn.homeshopping.pk/product_images/j/460/samsung-galaxy-fold-5g-2__57605_zoom__77077_thumb.jpg
https://cdn.homeshopping.pk/product_images/c/950/Apple_iPhone_7_Plus_%2832GB__Gold%29_Without_Facetime_1_Year_Official_Warranty__33040_thumb.jpg
https://cdn.homeshopping.pk/product_images/x/022/61vlByKsTSL._AC_SX679___91001_thumb.jpg
https://cdn.homeshopping.pk/product_images/e/023/images__40621_thumb.jpg
https://cdn.homeshopping.pk/product_images/i/979/s10-plus-black-hd-1__91254_thumb.jpg
https://cdn.homeshopping.pk/product_images/n/574/135743-v1-tecno-spark-go-mobile-phone-large-1__54522_thumb.jpg
https://cdn.homeshopping.pk/product_images/k/518/61vlByKsTSL._AC_SX679___72377_thumb.jpg
https://cdn.homeshopping.pk/product_images/a/035/37e49fd768e1cdce2b2b0bc7a3f8baa4__11345_thumb.png
https://cdn.homeshopping.pk/product_images/b/952/InfinixSmart4-b__55934_thumb.jpg
I am currently trying to scrape aviation data from craigslist. I have no problem getting all the info I want except the first image for each post. Here is my link:
https://spokane.craigslist.org/search/avo?hasPic=1
I have been able to get all images thanks to a different post on this site but I am having trouble figuring out how to get just the first image.
I am using bs4 and requests for this script. Here is what I have so far which gets every image:
from bs4 import BeautifulSoup as bs
import requests
image_url = 'https://images.craigslist.org/{}_300x300.jpg'
r = requests.get('https://spokane.craigslist.org/search/avo?hasPic=1')
soup = bs(r.content, 'lxml')
ids = [item['data-ids'].replace('1:','') for item in soup.select('.result-image[data-ids]', limit = 10)]
images = [image_url.format(j) for i in ids for j in i.split(',')]
print(images)
Any help is greatly appreciated.
Thanks in advance,
inzel
from bs4 import BeautifulSoup
import requests
r = requests.get("https://spokane.craigslist.org/search/avo?hasPic=1")
soup = BeautifulSoup(r.text, 'html.parser')
img = "https://images.craigslist.org/"
imgs = [f"{img}{item.get('data-ids').split(':')[1].split(',')[0]}_300x300.jpg"
for item in soup.findAll("a", class_="result-image gallery")]
print(imgs)
output:
['https://images.craigslist.org/00N0N_ci3cbcv5T58_300x300.jpg', 'https://images.craigslist.org/00101_5dLpBXXdDWJ_300x300.jpg', 'https://images.craigslist.org/00n0n_8zVXHONPkTH_300x300.jpg', 'https://images.craigslist.org/00l0l_jiNMe38avtl_300x300.jpg', 'https://images.craigslist.org/00q0q_l4hts9RPOuk_300x300.jpg', 'https://images.craigslist.org/00D0D_ibbWWn7uFCu_300x300.jpg', 'https://images.craigslist.org/00z0z_2ylVbmdVnPr_300x300.jpg', 'https://images.craigslist.org/00Q0Q_ha0o2IJwj4Q_300x300.jpg', 'https://images.craigslist.org/01212_5LoZU43xA7r_300x300.jpg', 'https://images.craigslist.org/00U0U_7CMAu8vAhDi_300x300.jpg', 'https://images.craigslist.org/00m0m_8c7azYhDR1Z_300x300.jpg', 'https://images.craigslist.org/00E0E_7k7cPL7zNnP_300x300.jpg', 'https://images.craigslist.org/00I0I_97AZy8UMt5V_300x300.jpg', 'https://images.craigslist.org/00G0G_iWw8AI8N8Kf_300x300.jpg', 'https://images.craigslist.org/00m0m_9BEEcvD0681_300x300.jpg', 'https://images.craigslist.org/01717_4Ut5FSIdoi3_300x300.jpg', 'https://images.craigslist.org/00h0h_jeAhtDXW2ST_300x300.jpg', 'https://images.craigslist.org/00T0T_hTogH4m9zTH_300x300.jpg', 'https://images.craigslist.org/01212_9x1EFI1CYHE_300x300.jpg', 'https://images.craigslist.org/00H0H_kiXLOtVgReA_300x300.jpg', 'https://images.craigslist.org/00P0P_ad77Eqvf1ul_300x300.jpg', 'https://images.craigslist.org/00909_jyBoTCNGmAJ_300x300.jpg', 'https://images.craigslist.org/00g0g_gFtJlANhi51_300x300.jpg', 'https://images.craigslist.org/00202_3LV7YERBssE_300x300.jpg', 'https://images.craigslist.org/00j0j_3zxT682nE2i_300x300.jpg', 'https://images.craigslist.org/00Y0Y_b6AXcApcSfl_300x300.jpg', 'https://images.craigslist.org/00M0M_6eTHo5E3Ee5_300x300.jpg', 'https://images.craigslist.org/00g0g_hvyvJKUejXY_300x300.jpg', 'https://images.craigslist.org/00I0I_d2WOWXtgQ8s_300x300.jpg', 'https://images.craigslist.org/00s0s_dAwJG0D6uce_300x300.jpg', 'https://images.craigslist.org/00g0g_TC2qvnD3AN_300x300.jpg', 'https://images.craigslist.org/00M0M_Dba39RfEkr_300x300.jpg', 'https://images.craigslist.org/00M0M_31drxF6c9vO_300x300.jpg', 'https://images.craigslist.org/00505_jOjMq3B8y0M_300x300.jpg', 'https://images.craigslist.org/00e0e_ixfV647qwLh_300x300.jpg', 'https://images.craigslist.org/00p0p_i2noTC4cADw_300x300.jpg', 'https://images.craigslist.org/00a0a_kywatxfm6Ud_300x300.jpg', 'https://images.craigslist.org/00808_1ZjIIX8PdaP_300x300.jpg', 'https://images.craigslist.org/01515_blEEDKbbyKD_300x300.jpg', 'https://images.craigslist.org/00b0b_brUn6sUxBzF_300x300.jpg', 'https://images.craigslist.org/00U0U_2ukBvcgvU99_300x300.jpg', 'https://images.craigslist.org/01212_dPTe5ZHM26A_300x300.jpg', 'https://images.craigslist.org/00B0B_1GsE81zVsr0_300x300.jpg', 'https://images.craigslist.org/00N0N_l8SXlBaI8lq_300x300.jpg', 'https://images.craigslist.org/00f0f_82qAzPq7cXd_300x300.jpg', 'https://images.craigslist.org/00w0w_lUrgFG9YOY0_300x300.jpg', 'https://images.craigslist.org/00C0C_kiZpgrFEnO8_300x300.jpg', 'https://images.craigslist.org/00T0T_g7IHvHMx14L_300x300.jpg', 'https://images.craigslist.org/00E0E_bzm9jRXpWVd_300x300.jpg', 'https://images.craigslist.org/00k0k_lOCRF1fgWCF_300x300.jpg', 'https://images.craigslist.org/00y0y_exwReppAi3L_300x300.jpg', 'https://images.craigslist.org/01515_7xyZ605hYcc_300x300.jpg', 'https://images.craigslist.org/00J0J_hqLMLvTCfXk_300x300.jpg', 'https://images.craigslist.org/00505_3P0xQrbeFY4_300x300.jpg', 'https://images.craigslist.org/00r0r_gj6dO6ZHO8L_300x300.jpg', 'https://images.craigslist.org/01717_cIVmzgKCWtP_300x300.jpg', 'https://images.craigslist.org/00w0w_6O59k6qlZQz_300x300.jpg', 'https://images.craigslist.org/00808_jd43ZthN1uB_300x300.jpg', 'https://images.craigslist.org/00m0m_1GJ41cKvv4Y_300x300.jpg']
That list is containing the first image for each post.
You need to find all class with the gallery of images then get the data-ids.
Then split them into a list and get the first element [0].
from bs4 import BeautifulSoup as bs
import requests
image_url = 'https://images.craigslist.org/{}_300x300.jpg'
r = requests.get('https://spokane.craigslist.org/search/avo?hasPic=1')
soup = bs(r.content, 'lxml')
ids = [item.get('data-ids').replace('1:','') for item in soup.findAll("a", {"class": "result-image gallery"}, limit=10)]
images = [image_url.format(i.split(',')[0]) for i in ids]
print(images)
Result:
['https://images.craigslist.org/00N0N_ci3cbcv5T58_300x300.jpg', 'https://images.craigslist.org/00101_5dLpBXXdDWJ_300x300.jpg', 'https://images.craigslist.org/00n0n_8zVXHONPkTH_300x300.jpg', 'https://images.craigslist.org/00l0l_jiNMe38avtl_300x300.jpg', 'https://images.craigslist.org/01212_fULyvfO9Rqz_300x300.jpg', 'https://images.craigslist.org/00D0D_ibbWWn7uFCu_300x300.jpg', 'https://images.craigslist.org/00z0z_2ylVbmdVnPr_300x300.jpg', 'https://images.craigslist.org/00Q0Q_ha0o2IJwj4Q_300x300.jpg', 'https://images.craigslist.org/01212_5LoZU43xA7r_300x300.jpg', 'https://images.craigslist.org/00U0U_7CMAu8vAhDi_300x300.jpg']
Here is a clean and straightforward solution:
from pprint import pprint
import requests
from bs4 import BeautifulSoup
base_image_url = 'https://images.craigslist.org/{}_300x300.jpg'
r = requests.get('https://spokane.craigslist.org/search/avo?hasPic=1')
soup = BeautifulSoup(r.content, 'lxml')
results = []
for elem in soup.find_all("a", attrs={"class": "result-image gallery"})[:2]:
listing_url = elem.get("href")
image_urls = []
image_ids = elem.get("data-ids")
if image_ids:
image_urls = [base_image_url.format(curr_id[2:]) for curr_id in image_ids.split(",")]
results.append((listing_url, image_urls))
pprint(results)
Output:
[('https://spokane.craigslist.org/avo/d/spokane-lightspeed-sierra-headset/7090771925.html',
['https://images.craigslist.org/00N0N_ci3cbcv5T58_300x300.jpg',
'https://images.craigslist.org/00q0q_5ax4n1nCwmI_300x300.jpg',
'https://images.craigslist.org/00202_pAcLlJsaR3_300x300.jpg',
'https://images.craigslist.org/00z0z_kCZGUL6WZZw_300x300.jpg',
'https://images.craigslist.org/00G0G_8A2Xg7Wbe7B_300x300.jpg',
'https://images.craigslist.org/00f0f_cNpP8ZfUXdU_300x300.jpg']),
('https://spokane.craigslist.org/avo/d/spokane-window-mounted-air-conditioner/7090361383.html',
['https://images.craigslist.org/00101_5dLpBXXdDWJ_300x300.jpg',
'https://images.craigslist.org/00I0I_lxNKJsQAT7X_300x300.jpg',
'https://images.craigslist.org/00t0t_3BeBsNO6xH6_300x300.jpg',
'https://images.craigslist.org/00L0L_aPnbejSiXQp_300x300.jpg'])]
Let me know if you have any questions :)
Here is the URL that I'am using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
In fact on this page, the link that I am looking for appears may be 5 second after loading the page.
I see after 5 second a post request to :
http://www.protect-stream.com/secur.php
with data like so :
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I didn't get from where the 'k' value come from ?
Is their an idea on how we could get the 'k' value using python ?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
base_url = "http://www.protect-stream.com"
with requests.Session() as session:
response = session.get("http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")
# get the top frame url
soup = BeautifulSoup(response.content, "html.parser")
src = soup.select_one('iframe[src^="frame.php"]')["src"]
frame_url = urljoin(base_url, src)
# get the nested frame url
response = session.get(frame_url)
soup = BeautifulSoup(response.content, "html.parser")
src = soup.select_one('iframe[src^="w.php"]')["src"]
frame_url = urljoin(base_url, src)
# get the frame HTML source and extract the "k" value
response = session.get(frame_url)
soup = BeautifulSoup(response.content, "html.parser")
script = soup.find("script", text=lambda text: text and "k=" in text).get_text(strip=True)
k_value = re.search(r'var k="(.*?)";', script).group(1)
print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,