BeautifulSoup: Unable to find image URLs in for loop - Python

I am trying to scrape a website with Beautiful Soup + Selenium to get the image URLs from <img> tags via the src attribute. I don't want to scrape by div class names. Here is the kind of tag I am scraping:
<img src="https://secure.gravatar.com/avatar/f1fb5ec60129b029e968f0522fe4828c?s=100&d=retro&f=y" alt="" width="55" height="55">
I want to get all the URLs under the img tag. Here is my code, which is giving me an error:
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
'Chrome/80.0.3987.132 Safari/537.36'
options = Options()
options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")
driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://python-forum.io/Thread-Using-beautiful-soup-to-get-html-attribute-value")
page = Soup(driver.page_source, features='html.parser')
divs = page.select("img")
for product in divs:
    ele = divs.find('src')
    print(ele)
It's giving me this AttributeError:
AttributeError: ResultSet object has no attribute 'find'.
You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Any help would be appreciated.

Initially I thought this ele = divs.find('src') should be ele = product.find('src'), but that didn't work either (find searches for tags, not attributes, so it just returns None), so I have implemented it in the following way. Change this
page = Soup(driver.page_source, features='html.parser')
divs = page.select("img")
for product in divs:
    ele = divs.find('src')
    print(ele)
to this
page = Soup(driver.page_source, features='html.parser')
divs = page.find_all("img")
print(divs)
for product in divs:
    ele = product['src']
    print(ele)
This should give you the value of the src attribute of each img tag.
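If some img tags lack a src attribute, product['src'] raises a KeyError. A slightly more defensive version of the loop above, as a sketch using only standard BeautifulSoup calls (.get() is Tag's dict-style safe accessor):

for product in divs:
    # .get() returns None instead of raising KeyError when src is missing
    ele = product.get('src')
    if ele is not None:
        print(ele)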

import requests
from bs4 import BeautifulSoup

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [item['content']
              for item in soup.findAll("meta", {'property': "og:image"})]
    print(target)

main("https://python-forum.io/Thread-Using-beautiful-soup-to-get-html-attribute-value")
Output:
['https://python-forum.io/images/facebook.png', 'https://secure.gravatar.com/avatar/f1fb5ec60129b029e968f0522fe4828c?s=100&d=retro&f=y']
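Note this collects the og:image meta tags rather than the img tags the question asked about; the gravatar URL just happens to appear in both. If you want the src of every img on the page, and the images are present in the static HTML rather than injected by JavaScript, a similar requests-only sketch would be (all_image_urls is a hypothetical helper name):

import requests
from bs4 import BeautifulSoup

def all_image_urls(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    # keep only img tags that actually carry a src attribute
    return [img['src'] for img in soup.find_all('img', src=True)]

print(all_image_urls("https://python-forum.io/Thread-Using-beautiful-soup-to-get-html-attribute-value"))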

Related

BeautifulSoup: how to get the value of the price from the webpage's source code if there is no id within the source code for the price

I am doing web scraping in Python with BeautifulSoup and wondering if there is a way of getting the value of a cell when it has no id. The code is as below:
from bs4 import BeautifulSoup
import requests
import time
import datetime
URL = "https://www.amazon.co.uk/Got-Data-MIS-Business-Analyst/dp/B09F319PK2/ref=sr_1_1?keywords=funny+got+data+mis+data+systems+business+analyst+tshirt&qid=1636481904&qsid=257-9827493-6142040&sr=8-1&sres=B09F319PK2%2CB09F33452D%2CB08MCBFLHC%2CB07Y8Z4SF8%2CB07GJGXY7P%2CB07Z2DV1C2%2CB085MZDMZ8%2CB08XYL6GRM%2CB095CXJ226%2CB08JDMYMPV%2CB08525RB37%2CB07ZDNR6MP%2CB07WL5JGPH%2CB08Y67YF63%2CB07GD73XD8%2CB09JN7Z3G2%2CB078W9GXJY%2CB09HVDRJZ1%2CB07JD7R6CB%2CB08JDKYR6Q&srpt=SHIRT"
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14092.77.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.107 Safari/537.36"}
page = requests.get(URL, headers = headers)
soup1 = BeautifulSoup(page.content, "html.parser")
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
title = soup2.find(id="productTitle").get_text()
price = soup2.find(id="priceblock_ourprice").get_text()
print(title)
print(price)
For this page, you have to select the garment size before the price is displayed. We can get the price from the dropdown list of sizes, which is a SELECT with id="dropdown_selected_size_name".
First, let's get a list of the options in the SELECT dropdown:
options = soup2.find(id='variation_size_name').select('select option')
Then we can get the price, say for size 'Large':
for opt in options:
    if opt.get('data-a-html-content', '') == 'Large':
        print(opt['value'])
or a little more succinctly:
print([opt['value'] for opt in options if opt.get('data-a-html-content', '') == 'Large'][0])
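Putting the two steps together, here is a minimal sketch (price_for_size is a hypothetical helper; it assumes the variation_size_name container and data-a-html-content attribute shown above) that falls back to 'n/a' when the size has no matching option:

def price_for_size(soup, size):
    # the container holding the size SELECT, as identified above
    variation = soup.find(id='variation_size_name')
    if variation is None:
        return 'n/a'
    for opt in variation.select('select option'):
        # each option carries its displayed price in the 'value' attribute
        if opt.get('data-a-html-content', '') == size:
            return opt['value']
    return 'n/a'

print(price_for_size(soup2, 'Large'))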

Python / Selenium / Beautiful Soup not scraping desired elements

I'm struggling to get this code to extract the desired information from one single page.
I've tried all the usual selenium tactics and added a time delay. Hopefully, it's something simple. I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
sleep(3)
source_data = browser.page_source
bs_data = bs(source_data,"html.parser")
#product id
try:
    product_id = bs_data.findfindAll('span', {'class': 'pdp-main-details__product-code'})
    product_id = product_id.replace('Product code:','').strip()
except:
    product_id = "n/a"
#image address
try:
    for image in bs_data.find("div", {"class":"s7staticimage"}):
        image_url = image.find('img')['src']
except:
    image_url = "n/a"
#product description
try:
    product_desc = bs_data.find('class',{'pdp-main-pdp-main-details__title'})
    product_desc = product_desc.get_text().strip()
except:
    product_desc = "n/a"
#product price
try:
    product_price = bs_data.find('class',{'co-product__price pdp-main-details__price'})
    product_price = product_price.get_text().strip()
except:
    product_price = "n/a"
print (url,'|',image_url,'|',product_id,'|',product_desc,'|',product_price)
browser.quit()
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find. Selenium alone is good enough here. I don't know why you have treated the elements as a list, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
browser.implicitly_wait(15)
product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
print(product_id.text)
image = browser.find_element_by_xpath("//*[@id=\"s7viewer_flyout\"]/div[1]/img[1]")
image_url = image.get_attribute('src')
print(image_url)
Output:
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188
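If implicitly_wait proves flaky for this page, an explicit wait on the product-code element is an alternative. A sketch using Selenium's WebDriverWait (note that newer Selenium versions also replace the find_element_by_* helpers with find_element(By..., ...)):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 15 seconds until the product code is actually in the DOM
product_id = WebDriverWait(browser, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'pdp-main-details__product-code'))
)
print(product_id.text)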

How to print the 1st element in an HTML tag

My code gets links/HTML from different "sections" of a page.
It prints 2 links per section; however, I only want the first one printed.
The expected output should not contain links ending with "video", as the output of my code does.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome()
jam=[]
baseurl='https://meetinglibrary.asco.org'
driver.get('https://meetinglibrary.asco.org/results?meetingView=2020%20ASCO%20Virtual%20Scientific%20Program&page=1')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source,'html.parser')
productlist=soup.find_all('a',class_='ng-star-inserted')
for item in productlist:
    for link in item.find_all('a',href=True):
        jam.append(baseurl+link['href'])
print(jam)
You can apply a condition before appending each link:
...
for item in productlist:
    ahrefs = item.find_all('a', href=True)
    for index in range(len(ahrefs)):
        if (index % 2 == 0) and ('video' not in ahrefs[index]['href']):
            jam.append(baseurl+ahrefs[index]['href'])
print(jam)
...
Let me know after trying.
Good luck
Use os.path.basename to get the end of the URL, and use the in operator to check whether "video" exists:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os
driver = webdriver.Chrome()
jam = []
baseurl = 'https://meetinglibrary.asco.org'
driver.get('https://meetinglibrary.asco.org/results?meetingView=2020%20ASCO%20Virtual%20Scientific%20Program&page=1')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
productlist = soup.find_all('a', class_='ng-star-inserted')
for item in productlist:
    for link in item.find_all('a', href=True):
        url = link['href']
        if "video" not in os.path.basename(url):
            jam.append(baseurl + url)
print(jam)
Result:
['https://meetinglibrary.asco.org/record/185955/abstract',
'https://meetinglibrary.asco.org/record/185955/slide',
'https://meetinglibrary.asco.org/record/185954/abstract',
'https://meetinglibrary.asco.org/record/186048/abstract',
'https://meetinglibrary.asco.org/record/186048/slide',
'https://meetinglibrary.asco.org/record/190197/slide',
'https://meetinglibrary.asco.org/record/192623/slide',
'https://meetinglibrary.asco.org/record/185414/abstract',
'https://meetinglibrary.asco.org/record/185414/slide',
'https://meetinglibrary.asco.org/record/185415/abstract',
'https://meetinglibrary.asco.org/record/185415/slide',
'https://meetinglibrary.asco.org/record/185473/abstract',
'https://meetinglibrary.asco.org/record/185473/slide',
'https://meetinglibrary.asco.org/record/187584/slide',
'https://meetinglibrary.asco.org/record/188561/slide',
'https://meetinglibrary.asco.org/record/186710/abstract',
'https://meetinglibrary.asco.org/record/186710/slide',
'https://meetinglibrary.asco.org/record/186699/abstract',
'https://meetinglibrary.asco.org/record/186699/slide',
'https://meetinglibrary.asco.org/record/186698/abstract',
'https://meetinglibrary.asco.org/record/186698/slide',
'https://meetinglibrary.asco.org/record/187720/slide',
'https://meetinglibrary.asco.org/record/187480/abstract',
'https://meetinglibrary.asco.org/record/187480/slide',
'https://meetinglibrary.asco.org/record/191961/slide',
'https://meetinglibrary.asco.org/record/192626/slide',
'https://meetinglibrary.asco.org/record/186983/abstract',
'https://meetinglibrary.asco.org/record/186983/slide',
'https://meetinglibrary.asco.org/record/188580/abstract',
'https://meetinglibrary.asco.org/record/188580/slide',
'https://meetinglibrary.asco.org/record/189047/abstract',
'https://meetinglibrary.asco.org/record/189047/slide',
'https://meetinglibrary.asco.org/record/190223/slide',
'https://meetinglibrary.asco.org/record/190273/slide',
'https://meetinglibrary.asco.org/record/184812/abstract',
'https://meetinglibrary.asco.org/record/184812/slide',
'https://meetinglibrary.asco.org/record/184927/slide',
'https://meetinglibrary.asco.org/record/184805/abstract',
'https://meetinglibrary.asco.org/record/184805/slide',
'https://meetinglibrary.asco.org/record/184811/abstract',
'https://meetinglibrary.asco.org/record/184811/slide',
'https://meetinglibrary.asco.org/record/185576/slide',
'https://meetinglibrary.asco.org/record/190147/slide']

How to extract links from elements?

I am trying to extract the links of every individual member, but I am not getting any output:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
soup = BeautifulSoup(r.text,'lxml')
for link in soup.find_all('h2',class_='resultTitle'):
    link1 = link.find('a')
    print(link1['href'])
You need to request the URL with a User-Agent header param (more details below).
Here resultContent is the div class of the "top doctors in Delhi-NCR" results, and cardWrap is the div class of each doctor card.
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Custom user agent'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers=headers)
soup = BeautifulSoup(r.text,'lxml')
resultContentArray = soup.find('div',{'class':'resultContent'}).find_all("div",{'class':'cardWrap'})
for rr in resultContentArray:
    title = rr.find('h2',{'class':'resultTitle'})
    link = rr.find("a",href=True)
    if link is not None:
        print(link['href'])
Output:
https://www.asklaila.com/category/Delhi-NCR/-/doctors/doctor/?category=176
https://www.asklaila.com/search/Delhi-NCR/greater-kailash-1/doctors/
https://www.asklaila.com/search/Delhi-NCR/-/maternity-hospital/
https://www.asklaila.com/Delhi-NCR/
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/
https://www.asklaila.com/search/Delhi-NCR/-/doctors/20
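Note that the first few links in that output are category and search links, because rr.find("a", href=True) grabs the first href anywhere in the card and the title variable is never used. A variant restricting the search to the result title (a sketch reusing only the names defined above):

for rr in resultContentArray:
    title = rr.find('h2', {'class': 'resultTitle'})
    if title is not None:
        # only the anchor inside the title, i.e. the member link itself
        link = title.find('a', href=True)
        if link is not None:
            print(link['href'])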
Use:
html.parser
a custom User-Agent header
the soup.select feature
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('h2[class="resultTitle"] > a'):
    print(link['href'])
The output:
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/sector-40/dr-amit-yadav/1ik21lZw/
Using SoupStrainer:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        print(link['href'])
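Note that this variant prints every link on the page, not just the twenty member links. SoupStrainer can also narrow the parse to the result titles; a sketch under the same assumption as the other answers (the member links live inside h2.resultTitle):

import httplib2
from bs4 import BeautifulSoup, SoupStrainer

http = httplib2.Http()
status, response = http.request('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
# parse only the h2.resultTitle headings, then take the anchor inside each
strainer = SoupStrainer('h2', class_='resultTitle')
for h2 in BeautifulSoup(response, 'html.parser', parse_only=strainer).find_all('h2'):
    a = h2.find('a', href=True)
    if a is not None:
        print(a['href'])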
There are twenty correct member links to retrieve. A concise way is to use a CSS selector with the parent class and a child combinator to get the a tag within:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers= {'User-Agent' : 'Mozilla/5.0'})
soup = BeautifulSoup(r.content,'lxml')
links = [item['href'] for item in soup.select('.resultTitle > a')]
print(links)
The server is looking for a User-Agent header to prevent users from scraping the content; you could set the request headers as a workaround.
from bs4 import BeautifulSoup
import requests
headers = dict()
headers['User-Agent']= "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers=headers)
soup = BeautifulSoup(r.text,'lxml')
# with open('h.html','w') as w:
# w.write(soup.text)
for link in soup.find_all('h2',class_='resultTitle'):
    link1 = link.find('a')
    print(link1['href'])
This should give you:
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/

How do I search within a website using the 'requests' module?

I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine and search companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re
companylist = ['ABEX Dachdecker Handwerks-GmbH']
url = 'https://www.firmenwissen.de/index.html'
payloads = {
    'searchform': 'UFT-8',
    'phrase':'ABEX Dachdecker Handwerks-GmbH',
    "mainSearchField__button":'submit'
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')
link_list= []
links = soup.findAll('a')
for li in links:
    link_list.append(li.get('href'))
print(link_list)
This code should bring me the next page with company information. But unfortunately, it returns only the home page. How can I do this?
Change the initial URL to the one the search actually posts to. Grab only the appropriate hrefs and add them to a set to ensure no duplicates (or alter the selector to return only one match if possible); add those items to a final set so you loop over only the required number of links. I have used Session on the assumption you will repeat this for many companies.
Iterate over the set using Selenium to navigate to each company URL and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = set()

## searches section; gather into set
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase':company,
            "mainSearchField__button":'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
        # print(soup.select_one('.fp-result').text)
        finalLinks = finalLinks.union(companyLinks)

for item in finalLinks:
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
Or, taking just the first link per company:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = []

## searches section; add to list
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase':company,
            "mainSearchField__button":'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
        finalLinks.append(companyLink)

for item in set(finalLinks):
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
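One caveat about the companyLink line above: select_one returns None when nothing matches, so a company with no results would raise a TypeError. A defensive variant of those two lines inside the loop (same selector, hypothetical handling):

match = soup.select_one("[href*='firmeneintrag/']")
if match is not None:
    finalLinks.append(baseUrl + match['href'])
else:
    # no result page for this company; skip it rather than crash
    print('no match for', company)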
