I want to learn how to get information from dynamically generated fields.
When I tried simple sites, everything worked. Then I decided to try a more difficult one, and now I can't figure it out. I've spent about two weeks ruling out, one by one, the solutions I found on the Internet.
Now I'm not sure I can get information that is rendered this way at all. Most likely I'm doing something wrong, but I've run out of ideas, so I decided to ask here. If anyone understands this and can point me in the right direction, please give me an example.
The site I'm using to learn: kbp.aero/en
The information I'm trying to get (the arrival schedule): .tbody .tr .td
For example, I tried:
1.
import time

import requests
from bs4 import BeautifulSoup

URL = 'https://kbp.aero/en/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}

time.sleep(1)
response = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')

items = soup.find('div', class_='table_wrp out yesterday')
items = items.findAll('tr', class_='tr')

comps = []
if len(items) > 0:
    for item in items:
        comps.append({
            'title': item.find('td', class_='td').get_text(strip=True),
        })

for comp in comps:
    print(comp['title'])

# for item in items:
#     comps.append({
#         'text': item.get_text(strip=True)
#     })
#
# for comp in comps:
#     print(comp['text'])
2.

from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def main():
    driver = webdriver.Chrome()
    driver.get("https://kbp.aero/en/")

    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.text_to_be_present_in_element((By.CLASS_NAME, 'tbody'), ''))
    tds = element.find_elements(By.CLASS_NAME, "td")
    for td in tds:
        print(td.text)

    # try:
    #     element = WebDriverWait(driver, 10).until(
    #         EC.presence_of_element_located((By.CLASS_NAME, "tbody"))
    #     )
    #     tds = element.find_elements(By.CLASS_NAME, "td")
    #     for td in tds:
    #         print(td.text)
    #
    # finally:
    #     driver.quit()
Thanks for any advice.
This will fetch the entire table data:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
PATH = r"chromedriverexe path"
driver = webdriver.Chrome(PATH)
driver.get("https://kbp.aero/en/")
driver.maximize_window()
sleep(3)
print(driver.find_element(By.CSS_SELECTOR, "div.table_wrp.out.today > table").text)
Output:
Рейс Час Призначення Перевізник Термінал Гейт Статус
TK 1256 15:05 Istanbul Turkish Airlines D D5 Boarding Completed
PS 9556 15:05 Istanbul Ukraine International Airlines D D5 Boarding Completed
7W 163 15:10 Lviv Wind Rose D D19 Boarding
FR 3167 15:10 Warsaw Ryanair D D9 Boarding
PS 9013 15:15 Ivano-Frankivsk Ukraine International Airlines D D18 Boarding
7W 113 15:15 Ivano-Frankivsk Wind Rose D D18 Boarding
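If you want each schedule row as a separate record instead of one text blob, here is a minimal sketch reusing the same driver session (it assumes the .tbody / .tr / .td class markup mentioned in the question, so adjust the selectors if the page changes):
# Reuses `driver` and `By` from the snippet above; run it while the page is still open.
rows = driver.find_elements(By.CSS_SELECTOR, "div.table_wrp.out.today .tbody .tr")
schedule = []
for row in rows:
    cells = [cell.text.strip() for cell in row.find_elements(By.CSS_SELECTOR, ".td")]
    if cells:  # skip empty placeholder rows
        schedule.append(cells)

for entry in schedule:
    print(entry)  # e.g. ['TK 1256', '15:05', 'Istanbul', ...]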
I want to scrape the news articles from a number of pages on the site: https://koreajoongangdaily.joins.com/section/business
At the end, I want to create a dictionary out of the scraped data which should have the date, UTC_date, title, authors_name, news_content, url.
Here is the code I tried, but I couldn't build the dictionary.
Import all the necessary modules:
from bs4 import BeautifulSoup as soup
import requests
import numpy as np
from pymongo import MongoClient
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from time import sleep
import uuid
import datetime
import time
from fake_useragent import UserAgent
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import sys
import warnings
warnings.filterwarnings('ignore')
import re
from tqdm import tqdm
def string_to_date(x):
    return datetime.datetime.strptime(x, '%Y/%m/%d')

def datee(pp):
    return str(pp.date())
To get the links,
def get_link(res):
    href_list = []
    for res in res_list:  # h3
        link_list = res.select('a')
        for link in link_list:  # a
            href = link.get('href')
            href_list.append(href)
    return href_list
To get the article body, title, authors, date and utc date from every link
def get_article(url):
    news_list = []
    title_list = []
    page = requests.get(url)
    bsobj = soup(page.content)
    for title in bsobj.findAll('h1', {'class': 'view-article-title serif'}):
        title_list.append(title.text.strip())
    for news in bsobj.findAll('div', {'class': 'article-content-left pb-30'}):
        news = news_list.append(news.text.strip())

    author_list = []
    for f in news:
        author = ""
        pattern = r"BY\b(.+)(?=\[.+\])"
        resultsss = re.search(pattern, f)
        if resultsss != None:
            author = resultsss.group(0).strip()[3:]
        authors = author_list.append(author)

    # there is a date in every article link, hence we can use that
    date_list_1 = []
    separator = '/business'
    for link in href_list:
        new_set1 = link.replace('https://koreajoongangdaily.joins.com/', '')
        new_set2 = new_set1.split(separator, 1)[0]
        new_set3 = date_list_1.append(new_set2)
    new_set4 = list(map(datee, new_set_4))

    # no separate time given, so add 00:00:00 for UTC
    p = []
    for x in new_set4:
        utc_date = p.append(str(x) + " 00:00:00")

    # print(news_list)
    return news_list, title_list, authors, new_set4, utc_date
Here n denotes the number of pages I want to scrape:
def scrape_the_article(n):
    options = webdriver.ChromeOptions()
    lists = ['disable-popup-blocking']

    caps = DesiredCapabilities().CHROME
    caps["pageLoadStrategy"] = "normal"

    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-Advertisement")
    options.add_argument("--disable-popup-blocking")

    driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe", options=options)  # paste your own chromedriver path

    url = "https://koreajoongangdaily.joins.com/section/business"
    driver.get(url)
    page = 0

    for step in tqdm(range(n)):  # set the page range here: how many pages you want to scrape
        page += 1
        time.sleep(2)
        try:
            button = driver.find_element_by_class_name("service-more-btn")
            button.click()
        except Exception as e:
            print("trying to scroll")
            driver.execute_script("window.scrollBy(0, 100);")
        print("Page: ", page)

    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    res_list = bs.select('div[class="mid-article3"]')

    for res in res_list:
        links = get_article_links(res)
        article = get_article(links)


scrape_the_article(4)
And at the end I want to make a dictionary which will look like this:
data = {'date': new_set4, 'utc_date_time': utc_date, 'title': title_list,'author': authors,
'content': news_list,'link': href_list}
But I couldn't get the dictionary I wanted. Please help me with this. Thank you!
There's an API endpoint that holds (almost) all data you need and each item is a dictionary, so you can construct your own data structure out of the API response.
NOTE: There's no author key in the response, so if you really need it, you'll have to visit each article URL (see the sketch after the sample output below).
Here's how to get the first 10 items:
import datetime
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}

results = requests.post(api_endpoint, headers=headers, data=payload)

for result in results.json()['RESULT_LIST']:
    date = (
        datetime.datetime
        .strptime(result['service_date'], '%Y%m%d%H%M%S')
        .strftime('%Y-%m-%d %H:%M:%S')
    )
    print(date)
    print(f"{result['list_title']}\n{result['cmss_url']}")
    print(f"{result['summary']}")
    print("-" * 50)
Output:
2022-10-25 18:20:42
Bio business
https://koreajoongangdaily.joins.com/2022/10/25/business/industry/Korea-World-Bio-Summit-Seoul/20221025182043006.html
President Yoon Suk-yeol delivers an opening address at the World Bio Summit 2022 held at the Grand Walkerhill Seoul in Gwangjin District, eastern Seoul, on Tuesday.
--------------------------------------------------
2022-10-25 18:20:33
Mirae Group invests in Musk's Twitter takeover
https://koreajoongangdaily.joins.com/2022/10/25/business/tech/Korea-Twitter-Elon-Musk/20221025182048690.html
Mirae Asset Financial Group will invest $212 million in Elon Musks’ $44 billion acquisition of Twitter, according to electronic disclosures and local media reports.
--------------------------------------------------
2022-10-25 18:20:00
Smart chair
https://koreajoongangdaily.joins.com/2022/10/25/imageNews/photos/KT-robotics-smart-autonomous-chairs/20221025182003312.html
A demonstration of an autonomous “smart” robot chair at the Dongdaemun Design Plaza in Seoul. KT announced that it is making the smart robotic chair available for three weeks to visitors attending the DDP-NFT exhibition.
--------------------------------------------------
and more ...
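If you do need the author, a rough sketch for pulling it from a single article page could look like this. It reuses the 'BY ...' byline regex from your own code, and the 'article-content-left pb-30' class name is also taken from your snippet, so both are assumptions that may not match every article:
import re

import requests
from bs4 import BeautifulSoup

def get_author(article_url, headers):
    # Fetch one article page and try to extract the author from the 'BY ...' byline.
    page = requests.get(article_url, headers=headers)
    bsobj = BeautifulSoup(page.content, 'html.parser')
    body = bsobj.find('div', {'class': 'article-content-left pb-30'})
    if body is None:
        return None
    match = re.search(r"BY\b(.+)(?=\[.+\])", body.get_text())
    return match.group(1).strip() if match else None

# Example usage with one URL from the API response:
# print(get_author(result['cmss_url'], headers))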
To paginate the API, try this example:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"
payload = {
    "url": "/section/business",
    "currPage": "1",
}

with requests.Session() as s:
    for page in range(1, 100, 10):
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload)
        for result in results.json()['RESULT_LIST']:
            print(result['service_date'])
            print(f"{result['list_title']}\n{result['cmss_url']}")
            print(f"{result['summary']}")
            print("-" * 50)
NOTE: I'd highly recommend throttling the requests to 1-3 seconds between attempts.
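For example, a small sketch of that throttling inside the pagination loop above (a random 1-3 second delay is just one common choice):
import random
import time

import requests

# Reuses api_endpoint, headers and payload from the snippet above.
with requests.Session() as s:
    for page in range(1, 100, 10):
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload)
        # ... process results.json()['RESULT_LIST'] as shown above ...
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before the next request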
I need to scrape the titles of all blog post articles loaded via a Load More button, for the page range set by for i in range(1,3):
At present I'm only able to capture the titles from the first page, even though I'm able to navigate to the next page using Selenium.
Update:
In a previous question of mine (How To Scrape Content With Load More Pages Using Selenium Python), the pagination URL was captured via:
Network Tab > Reload Page > Click Show more button > Select wp-admin/admin-ajax.php?...... > Right Click Copy > Copy Link Address.
However, I do not know how to capture a similar URL for the site learnwoo.com/blog. I'm not sure if it uses a different technique.
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless')  # Comment to view browser actions

# Get website url
urls = "https://learnwoo.com/blog/"
r = requests.get(urls)

driver = webdriver.Chrome(executable_path="C:\webdrivers\chromedriver.exe", options=options)
driver.get(urls)

productlist = []

for i in range(1, 3):
    # Get Page Information
    soup = BeautifulSoup(r.content, features='lxml')
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')

    for single_item in items:
        title = single_item.find('h3').text.strip()
        print('Title:', title)

        product = {
            'Title': title,
        }

        productlist.append(product)
    print()

    time.sleep(5)
    WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).send_keys(Keys.ENTER)

driver.close()

# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
Alternative solution: you can use the API response to extract the desired data. From the API response, I'm getting a total of 74 items, where each page contains 6 items.
import pandas as pd
import requests
from bs4 import BeautifulSoup

params = {
    'id': '',
    'post_id': '0',
    'slug': 'home',
    'canonical_url': 'https://jooble.org/blog/',
    'posts_per_page': '6',
    'page': '0',
    'offset': '20',
    'post_type': 'post',
    'repeater': 'default',
    'seo_start_page': '1',
    'preloaded': 'false',
    'preloaded_amount': '0',
    'lang': 'en',
    'order': 'DESC',
    'orderby': 'date',
    'action': 'alm_get_posts',
    'query_type': 'standard',
}

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}

api_url = 'https://jooble.org/blog/wp-admin/admin-ajax.php'

productlist = []
for params['page'] in range(0, 13):
    req = requests.get(api_url, params=params, headers=headers)
    e = req.json()['html']
    soup = BeautifulSoup(e, 'lxml')
    items = soup.find_all('div', class_='front__news-content-wrapper')
    for single_item in items:
        title = single_item.find('div', class_='front__news-title')
        title = title.text.strip() if title else None
        product = {
            'Title': title,
        }
        productlist.append(product)

df = pd.DataFrame(productlist)
print(df)
Output:
Title
0 How to become an anesthesiologist
1 How to Become a Flight Attendant
2 How To Become An Influencer
3 How to Become an Electrician
4 3 Common Job Scams You Should Stay Away From
.. ...
69 Exploring Main Types of Remote Work
70 14 books HR specialist should read. Part 2
71 14 books HR specialist should read. Part 1
72 Don’t do that: 7 mistakes ruining your job int...
73 Virtual job interview. Jooble tips how to nail it
[74 rows x 1 columns]
To answer your question in a Selenium context, you could call .click():
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()
Concerning your XHR request comment - note that here it is not a GET but a POST request (https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11), and you have to send some additional payload with requests.
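A rough sketch of that requests route (the payload keys below are placeholders - copy the real form-data fields for this POST from the DevTools Network tab, since I haven't listed them here):
import requests

api_url = "https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11"
headers = {
    "user-agent": "Mozilla/5.0",
    "x-requested-with": "XMLHttpRequest",
}
# Placeholder payload: open DevTools > Network, click "Load more", select the
# admin-ajax.php request and copy its form-data fields into this dict.
payload = {
    "action": "...",         # placeholder, copy from the Network tab
    "td_current_page": "2",  # placeholder, copy from the Network tab
}

r = requests.post(api_url, headers=headers, data=payload)
print(r.status_code)
print(r.text[:500])  # inspect first whether the response is an HTML fragment or JSON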
Selenium example
This example is based on Selenium 4 and uses its imports; you may want to check https://www.selenium.dev/documentation/webdriver/getting_started/upgrade_to_selenium_4/#python-1
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

urls = "https://learnwoo.com/blog/"
driver.get(urls)
productlist = []

for i in range(1, 3):
    soup = BeautifulSoup(driver.page_source)
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')
    for single_item in items:
        product = {
            'Title': single_item.find('h3').text.strip(),
        }
        productlist.append(product)
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()

pd.DataFrame(productlist)
Output
Title
0 5 Futuristic eCommerce Trends
1 9 Best Shopify Apps to Print Shipping Labels
2 Cloud Hosting VS VPS Hosting: A Comparison
3 10 Best WooCommerce Facebook Integration Plugins
4 Complete Guide to BigCommerce Security
... ...
91 How To Calculate ROI of Your Moodle LMS?
92 How and Where to Find Help for WordPress Begin...
93 Expert Speaks: In Conversation with Amir Helze...
94 A Complete Analysis: NetSuite WooCommerce Inte...
95 Review of Currency Switcher & Converter for Sh...
96 rows × 1 columns
Here is an alternative to HedgHog's response: maybe the better way here is to use a while loop, as we don't know how many entries there are. I used a counter to break out of the loop after the 5th load - if you want to get all the entries, just remove the counter.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver")  # path to where you saved the chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)

big_list = []
counter = 1
url = 'https://learnwoo.com/blog/'
browser.get(url)

while True:
    try:
        load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="load_more"]')))
        load_button.click()
        counter = counter + 1
        print('clicked to load more')
        t.sleep(3)
        entries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3[class="entry-title td-module-title"]')))
        print('we have', len(entries), 'articles')
        big_list = [(x.text, x.find_element(By.TAG_NAME, 'a').get_attribute('href')) for x in entries]
        if counter > 5:
            break
    except Exception as e:
        print('all done')
        break

df = pd.DataFrame(big_list, columns=['Article', 'Url'])
print(df)
Result:
Article Url
0 5 Futuristic eCommerce Trends https://learnwoo.com/future-ecommerce-trends/
1 9 Best Shopify Apps to Print Shipping Labels https://learnwoo.com/best-shopify-apps-print-s...
2 Cloud Hosting VS VPS Hosting: A Comparison https://learnwoo.com/cloud-hosting-vps-hosting/
3 10 Best WooCommerce Facebook Integration Plugins https://learnwoo.com/best-woocommerce-facebook...
4 Complete Guide to BigCommerce Security https://learnwoo.com/bigcommerce-security-guide/
... ... ...
286 A Comparison Between Omnichannel and Multichan... https://learnwoo.com/omnichannel-multichannel-...
287 8 Winning Techniques for Off-page SEO https://learnwoo.com/winning-techniques-for-of...
288 WooCommerce – How to Understand User Roles and... https://learnwoo.com/woocommerce-understand-us...
289 7 Best Free WooCommerce Catalog Mode Plugins (... https://learnwoo.com/free-woocommerce-catalog-...
290 Different WooCommerce Product Types Explained ... https://learnwoo.com/woocommerce-different-pro...
291 rows × 2 columns
I am using BeautifulSoup to filter data from a website. To do this, I pass several search terms in a loop using the site's built-in search box.
If a search term does not find any content, the soup loop below breaks with:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_25352/1005464644.py in <cell line: 10>()
21
22 soup = BeautifulSoup(driver.page_source, "html.parser")
---> 23 results = soup.find('ul', {'class':'result-list'}).find_all('li')
24
25 for i in results:
AttributeError: 'NoneType' object has no attribute 'find_all'
Because no data was found, logically no data could be transferred to the soup.
How can I catch this error?
Thanks for your help.
Here is the code:
# Imports
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.google.com")

from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

ausschreibungsliste = []

sb_1 = ['66512200', '85140000', '75000000', '75130000', '75131000', '79200000', '79210000', '79510000']

for z in sb_1:
    time.sleep(1)
    driver.get('https://www.service.bund.de/Content/DE/Ausschreibungen/Suche/Formular.html')

    was_sb1 = driver.find_element("xpath", '//input[@id="f4641464d4642144"]')
    was_sb1.send_keys(z)
    was_sb1.send_keys(Keys.RETURN)

    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        results = soup.find('ul', {'class': 'result-list'}).find_all('li')

        for i in results:
            # build the record
            # tender title (Ausschreibung)
            ausschreibung = i.find_all('h3')[0].get_text().strip().replace(u'Ausschreibung', u'').replace(u'\xad', u'')
            # contracting authority (Vergabestelle)
            organisation = i.find_all('p')[0].get_text().strip().replace(u'Vergabestelle ', u'')
            # publication date
            verdatum = i.find_all('p')[1].get_text().strip().replace(u'Veröffentlicht ', u'')
            # bid deadline (Frist)
            frist = i.find_all('p')[2].get_text().replace(u'Angebotsfrist ', u'')
            # type
            typ = 'Ausschreibung'
            # website
            website = 'service.bund.de'
            # review from
            pruefdatum_format = 'fehlt'
            # file created
            jetzt = 'fehlt'

            i_info = {
                'Vergabedatum': verdatum,
                'Frist': frist,
                'Organisation': organisation,
                'Ausschreibung': ausschreibung,
                'Typ': typ,
                'Website': website,
                'Prüfung ab': pruefdatum_format,
                'Datei erzeugt': jetzt
            }

            ausschreibungsliste.append(i_info)

        try:
            time.sleep(2)
            driver.find_element("xpath", '//*[@id="main"]/div/div/section[2]/div[1]/div/form/fieldset/div[2]/div[1]/ul/li[2]/a').click()
        except NoSuchElementException:
            break
Something along the lines of:
if soup.find('ul', {'class': 'result-list'}):
    results = soup.find('ul', {'class': 'result-list'}).find_all('li')
    for i in results:
        ...  # etc.
else:
    pass
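Or, since you asked how to catch the error, you could literally catch the AttributeError inside your while loop. A small sketch, reusing your soup variable:
try:
    results = soup.find('ul', {'class': 'result-list'}).find_all('li')
except AttributeError:
    results = []  # no result list on this page, so there is nothing to iterate

for i in results:
    ...  # process each result as before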
I have now found a solution based on the comments of Scott Hunter and AudioBaten.
Here is the (shortened) code:
ausschreibungsliste = []

# CPV codes to search for
cpvcode = ['32441300', '64226000', '66512200']

for z in cpvcode:
    time.sleep(1)
    driver.get('the_url')

    suchfeld = driver.find_element("xpath", '//input[@id="f4641464d4642144"]')
    suchfeld.clear()
    suchfeld.send_keys(z)
    suchfeld.send_keys(Keys.RETURN)

    try:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        results = soup.find('ul', {'class': 'result-list'}).find_all('li')

        while True:
            for i in results:
                # build the record
                .... etc. ....

                i_info = {
                    'Vergabedatum': verdatum,
                    'Frist': frist,
                    'Organisation': organisation,
                    'Ausschreibung': ausschreibung,
                    'CPV-Code': z,
                    'Link': linkausschreibung,
                    'Typ': typ,
                    'Website': website,
                    'Prüfung ab': pruefdatum_format,
                    'Datei erzeugt': jetzt
                }

                ausschreibungsliste.append(i_info)

            # load the next page until the last page is reached
            if not soup.select_one('span', {'class': 'disabled'}):
                next = driver.find_element("xpath", '//*[@id="main"]/div/div/section[2]/div[1]/div/form/fieldset/div[2]/div[1]/ul/li[2]/a').click()
            else:
                print('Ausschreibungen gefunden :', len(ausschreibungsliste))
                break
    except:
        continue
Thanks for your help.
I want to scrape the data from this domain: https://stacker.com/stories/1587/100-best-movies-all-time
I can get data only if I add the User-Agent header:
from bs4 import BeautifulSoup as BS
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
}

url = 'https://www.the-numbers.com/movie/Avengers-Endgame-(2019)#tab=cast-and-crew'

response = requests.get(url, headers=headers)

# --- response ---

#print(response.status_code)
#print(response.text[:1000])

soup = BS(response.text, 'html.parser')

all_items = soup.find_all('div', id="cast-and-crew")

for item in all_items:
    print(item.get_text(strip=True, separator='\n'))
Result:
Lead Ensemble Members
Robert Downey, Jr.
Tony Stark/Iron Man
Chris Evans
Steve Rogers/Captain America
Mark Ruffalo
Bruce Banner/Hulk
Chris Hemsworth
Thor
Scarlett Johansson
Natasha Romanoff/Black Widow
Jeremy Renner
Clint Barton/Hawkeye
Don Cheadle
...
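The same User-Agent trick should work for the stacker.com URL mentioned above. Here is a minimal sketch that only checks the response status and page title, since I haven't inspected that page's markup (pick your own selectors once you see what comes back):
from bs4 import BeautifulSoup as BS
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
}

url = 'https://stacker.com/stories/1587/100-best-movies-all-time'
response = requests.get(url, headers=headers)
print(response.status_code)

soup = BS(response.text, 'html.parser')
print(soup.title.get_text(strip=True) if soup.title else 'no <title> found')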
I am new to this like you. With BeautifulSoup the request didn't get me the data (maybe some type of security), but I tried to do what you want with Selenium and it works. Check this:
from selenium import webdriver

website = "https://www.the-numbers.com/movie/Avengers-Endgame-(2019)#tab=cast-and-crew"
path = "/"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("excludeSwitches", ['enable-logging'])
driver = webdriver.Chrome(options=chrome_options)
driver.get(website)

box = driver.find_element_by_class_name("cast_new")
matches = box.find_elements_by_xpath('//*[@id="cast-and-crew"]/div[5]/table/tbody/tr[1]/td[1]/b/a')

for match in matches:
    print(match.text)

driver.quit()
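Note that find_element_by_class_name and find_elements_by_xpath are deprecated in Selenium 4 (and removed in newer 4.x releases); if you're on a recent version, the equivalent lookups would be roughly:
from selenium.webdriver.common.by import By

box = driver.find_element(By.CLASS_NAME, "cast_new")
matches = box.find_elements(By.XPATH, '//*[@id="cast-and-crew"]/div[5]/table/tbody/tr[1]/td[1]/b/a')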
I want to get the first 10 image URLs from a Google image search (not base64).
I have this code:
import os
import base64
import time

from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

searchterm = 'bananas'  # will also be the name of the folder
url = "https://www.google.com/search?q=banan&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj-75rDlJLoAhWLHHcKHStFC6EQ_AUoAXoECA4QAw&biw=1867&bih=951"

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
browser = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)
browser.get(url)
actions = webdriver.common.action_chains.ActionChains(browser)

header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

counter = 0
succounter = 0

if not os.path.exists(searchterm):
    os.mkdir(searchterm)

for i in range(0, 11):
    time.sleep(1)
    x = browser.find_elements_by_xpath('//*[@id="islrg"]/descendant::img')[i]
    x.click()
    i += 1
    if i > 10:
        break
    ba = browser.find_element_by_xpath('//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[2]/a/img')
    print(ba.get_attribute('src'))
It returns image URLs, but sometimes base64 data. How do I make the script always return the image URL?
Thank you.
Change the XPath to get the link rather than the image, and then get its href:
ba = browser.find_element_by_xpath("//div[@class='islrc']//a[@href][@rel='noopener']")
print(ba.get_attribute("href"))
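Since you want the first 10, here is a small sketch that extends the same idea to a list of matches (Selenium 4 style locators assumed; Google's class names change often, so treat the selector above as a best guess):
from selenium.webdriver.common.by import By

# Grab the result links on the thumbnails page and print the first 10 hrefs.
links = browser.find_elements(By.XPATH, "//div[@class='islrc']//a[@href][@rel='noopener']")
for a in links[:10]:
    print(a.get_attribute("href"))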
You can always get only image URLs if you scrape another search engine, DuckDuckGo, using the following code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

search_query = 'what you want to find'
num_images = 10
driver_location = '/put/location/of/your/driver/here'

# setting up the driver
ser = Service(driver_location)
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

# searching the query
driver.get(f'https://duckduckgo.com/?q={search_query}&kl=us-en&ia=web')

# going to the Images section
ba = driver.find_element(By.XPATH, "//a[@class='zcm__link js-zci-link js-zci-link--images']")
ba.click()

# getting the image URLs
for result in driver.find_elements(By.CSS_SELECTOR, '.js-images-link')[0:0+num_images]:
    imageURL = result.get_attribute('data-id')
    print(f'{imageURL}\n')

driver.quit()