Can't find and process text extracted from HTML - Python

I'm trying to search a webpage for the "Spanish" content but can't get it at all.
This is the code I have so far:
from bs4 import BeautifulSoup
import requests
import re
url = 'http://www.autotaskstatus.net/'
r = requests.get(url)
estado = r.status_code
r = r.content
soup = BeautifulSoup(r, "html.parser")
data = soup.find_all('span', attrs={'class':'name'})[1]
pais = 'Spanish'
data.get_text()
print(data.text)
I have the "pais" variable there so it can later be replaced by an input, letting the user search for whatever country they want.
The only data I get with index 1 there is "Limited Release", but with index 0 I can't filter the results at all.
I have searched all over the Internet and couldn't find anyone with this same problem, so I can't find a solution.
I am using Python 3.6
Edit: since people found this unclear, I'll explain it now.
What I have on the page is (just a part):
<div data-component-id="fp5s6cp13l47"
class="component-inner-container status-green "
data-component-status="operational"
data-js-hook="">
<span class="name">
Concord
</span>
<span class="tooltip-base tool" title="https://concord.centrastage.net">?</span>
<span class="component-status">
Operational
</span>
So "Spanish" sits in the markup just like "Concord" does here, and what I want to extract is "Spanish" (and later on "Operational"), with the country name in a variable so it can later be changed to any country listed there.
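As a clarifying sketch (not from the original thread): instead of indexing into find_all, the span can be matched by its text and the status read from the enclosing container, assuming the markup shown above:
from bs4 import BeautifulSoup
import requests

url = 'http://www.autotaskstatus.net/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

pais = 'Spanish'
# Match the <span class="name"> whose text is the country name
name_span = soup.find('span', class_='name',
                      string=lambda s: s and s.strip() == pais)
if name_span is not None:
    container = name_span.find_parent('div')
    status = container.find('span', class_='component-status')
    print(pais, '-', status.get_text(strip=True))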

You can get the Spanish server status using this approach:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.autotaskstatus.net/'
with requests.Session() as s:
    s.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    r = s.get(URL)
    soup = BeautifulSoup(r.content, "html.parser")
    data = soup.find_all('div', attrs={'class': 'component-inner-container'})
    pais = 'Spanish'
    print([d.find('span', {'class': 'name'}).text.strip() + ' - ' + d.find('span', {'class': 'component-status'}).text.strip() for d in data if pais in d.text])
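The list comprehension is dense; an equivalent unrolled loop (same logic as above, just easier to read) would be:
for d in data:
    if pais not in d.text:
        continue
    name = d.find('span', {'class': 'name'}).text.strip()
    status = d.find('span', {'class': 'component-status'}).text.strip()
    print(name + ' - ' + status)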

Related

Grabbing some data from a card-body div class

Good day.
My script is in progress and I need help or ideas to make it work properly. I am able to grab some data, but it's not really readable or useful, so your help and ideas are needed.
from bs4 import BeautifulSoup
import requests
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
url = "https://bscscan.com/tx/0xb07b68f72f0b58e8cfb8c8e896736f49b13775ebda25301475d24554a601ff97#eventlog"
urlpage = requests.get(url, headers=headers, timeout=10, allow_redirects=False)
soup = BeautifulSoup(urlpage.content, 'html.parser')
price = soup.find('div', class_='d-none d-md-inline-block u-label u-label--price rounded mt-1 ml-n1 text-nowrap').get_text()#.strip()
print ("Price: ", price)
data1 = soup.find('div', class_='card-body').get_text()#.strip()
print (data1)
data2 = soup.find('span', class_='btn btn-icon btn-soft-success rounded-circle').get_text()#.strip()
print (data2)
Current Output:
Price:
BNB: $422.35 (-3.05%) | 5 Gwei
Transaction Hash:
0xb07b68f72f0b58e8cfb8c8e896736f49b13775ebda25301475d24554a601ff97
Status:Success
Squeezed text (173 lines).
206
Wanted Output:
Price:
BNB: $422.35 (-3.05%) | 5 Gwei
207 #-- latest data
Address: 0x81e0ef68e103ee65002d3cf766240ed1c070334d
Topics: 0 0x598cd56214a374d15f638dd04913e0288cd76c7833ee66b15cf55845d875a187
Data
0000000000000000000000000000000000000000000000000000000061b23bae
00000000000000000000000000000000000000000000000000000000979144b0
Here is an alternative which caters for always picking up the latest transaction (if more transactions are added). Because JavaScript doesn't run with requests, the content isn't as it appears on the webpage; you need to target the element with id myTabContent.
I've attempted to go with hopefully more stable selector lists and to avoid some of the potentially less robust classes.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://bscscan.com/tx/0xb07b68f72f0b58e8cfb8c8e896736f49b13775ebda25301475d24554a601ff97#eventlog', headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
#select price info
price = soup.select_one('#ethPrice').get_text(' ', strip = True)
# select latest event
last_transaction = soup.select_one('#myTabContent div.media:nth-last-child(2)')
latest_number = int(last_transaction.select_one('.btn-icon__inner').text)
address = last_transaction.select_one('a.text-break').text
topic = last_transaction.select_one('li > .text-break').text
print('Price:', price)
print('Latest number:', latest_number)
print('Address:', address)
print('Topics:', topic)
print('Data')
for data in last_transaction.select('[id^=chunk].text-break'):
    print(data.text)
Actually, selecting all the data according to the requirement is a little bit complex. I apply CSS selectors here; however, you can also use the find_all/find methods.
from bs4 import BeautifulSoup
import requests
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
url = "https://bscscan.com/tx/0xb07b68f72f0b58e8cfb8c8e896736f49b13775ebda25301475d24554a601ff97#eventlog"
urlpage = requests.get(url, headers=headers, timeout=10, allow_redirects=False)
soup = BeautifulSoup(urlpage.content, 'html.parser')
price = soup.find('div', class_='d-none d-md-inline-block u-label u-label--price rounded mt-1 ml-n1 text-nowrap').get_text()#.strip()
print ("Price: ", price)
for card in soup.select('div.media')[1:2]:
    num = card.select_one('.mt-1.mr-3').text
    print(num)
    address = card.select_one('.col-md-10.mb-0 a').text
    print(address)
    topic = card.select_one('.text-monospace.text-break').text
    print(topic)
    data1 = card.select_one('#chunk_2_4').text
    print(data1)
    data2 = card.select_one('#chunk_2_5').text
    print(data2)
Output:
Price:
BNB: $422.41 (-3.65%) | 5 Gwei
207
0x81e0ef68e103ee65002d3cf766240ed1c070334d
0x598cd56214a374d15f638dd04913e0288cd76c7833ee66b15cf55845d875a187
0000000000000000000000000000000000000000000000000000000061b23bae
00000000000000000000000000000000000000000000000000000000979144b0
It's working. The problem was that data2 = card.select_one('#chunk_2_5') does not exist for this transaction, so you were getting a NoneType error, but everything else is okay:
from bs4 import BeautifulSoup
import requests
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
url = "https://bscscan.com/tx/0x173c462e910c95a67c119c61566330a835e4785221e247fada6d2279052519f1#eventlog"
urlpage = requests.get(url, headers=headers, timeout=10, allow_redirects=False)
soup = BeautifulSoup(urlpage.content, 'html.parser')
price = soup.find('div', class_='d-none d-md-inline-block u-label u-label--price rounded mt-1 ml-n1 text-nowrap').get_text()#.strip()
print ("Price: ", price)
for card in soup.select('div.media')[1:2]:
    num = card.select_one('.mt-1.mr-3').text
    print(num)
    address = card.select_one('.col-md-10.mb-0 a').text
    print(address)
    topic = card.select_one('.text-monospace.text-break').text
    print(topic)
    data1 = card.select_one('#chunk_2_4').text
    print(data1)
    # data2 = card.select_one('#chunk_2_5').text
    # print(data2)
    # If you need all the updated data:
    # for all_data in card.select('[id^=chunk]'):
    #     print(all_data.text)
Output:
Price:
BNB: $422.25 (-3.15%) | 5 Gwei
315
0x7ee058420e5937496f5a2096f04caa7721cf70cc
0x694af1cc8727cdd0afbdd53d9b87b69248bd490224e9dd090e788546506e076f
0000000000000000000000000000000000000000000000000000000062e6b858
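Since the missing chunk is what produced the NoneType error, a small guard keeps the scraper from crashing when a selector finds nothing. A minimal sketch (the safe_text helper is illustrative, not part of the original answers):
def safe_text(tag):
    # select_one returns None when nothing matches; avoid calling .text on None
    return tag.get_text(strip=True) if tag is not None else None

for card in soup.select('div.media')[1:2]:
    data2 = safe_text(card.select_one('#chunk_2_5'))  # this chunk may be absent
    if data2 is not None:
        print(data2)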

Loop duplicating results

I'm writing code to web-scrape the Transfermarkt website, but I'm having some issues with it.
The code had returned an error that was fixed through the topic: Loop thru multiple URLs in Python - InvalidSchema("No connection adapters were found for {!r}".format
After this fix, other problems came up.
First: the code is duplicating the results in the data frame.
Second: the code is taking only the last element of each URL. In fact, what I want is to get all the agency URLs in pagina = range(1) and then scrape all the players in each agency, through the URLs scraped in the first part.
P.S.: pagina = range(1) will become range(1, 40); it's the number of pages that I will scrape to get all the agencies' links.
Can anyone give me a hand with these issues?
Thanks!
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.sessions import default_headers
nome=[]
posicao=[]
nacionalidade=[]
idade=[]
clube=[]
contrato=[]
valor=[]
tf = f"http://www.transfermarkt.com.br"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}
pagina = range(1,5)
def main(url):
with requests.Session() as req:
links = []
for lea in pagina:
print(f"Extraindo links da página {lea}")
r = req.get(url.format(lea), headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
link = [f"{tf}{item.next_element.get('href')}" for item in soup.findAll(
"td", class_="hauptlink")]
links.extend(link)
print(f"Collected {len(links)} Links")
time.sleep(1)
for url in links:
r= requests.get(url, headers=headers)
r.status_code
soup = BeautifulSoup(r.text, 'html.parser')
player_info= soup.find_all('tr', class_=['odd', 'even'])
for info in player_info:
player = info.find_all("td")
vall= info.find('td', {'class': 'zentriert hauptlink'})
nome.append(player[2].text)
posicao.append(player[3].text)
nacionalidade.append(player[4].img['alt'])
idade.append(player[5].text)
clube.append(player[6].img['alt'])
contrato.append(player[7].text)
valor.append(vall)
time.sleep(1)
df = pd.DataFrame(
{"NOME":nome,
"POSICAO":posicao,
"NACIONALIDADE":nacionalidade,
"IDADE":idade,
"CLUBE":clube,
"CONTRATO":contrato,
"VALOR":valor}
)
print(df)
df
#df.to_csv('MBB.csv', index=False)
main("https://www.transfermarkt.com.br/berater/beraterfirmenuebersicht/berater?ajax=yw1&page={}")

how to scrape nested tag elements with python

Hi, I would like to get some data which is inside the <del> or <ins> tags below, but I could not find any solution for it. Does anyone have an idea about this scraping, and is there a short way of getting that information?
This is my Python code:
import requests
import json
from bs4 import BeautifulSoup
header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
base_url = "https://www.n11.com/super-firsatlar"
r = requests.get(base_url,headers=header)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    books = soup.find_all('li', attrs={"class": "column"})
    result = []
    for book in books:
        title = book.find('h3').text
        link = base_url + book.find('a')['href']
        picture = base_url + book.find('img')['src']
        price = soup.find('a', attrs={"class": "ins"})
        single = {'title': title, 'link': link, 'picture': picture, 'price': price}
        result.append(single)
    with open('book.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
else:
    print(r.status_code)
<div class="proDetail">
<a href="https://test.com"class="oldPrice" title="Premium">
<del>69,00 TL</del></a>
<a href="https://test.com"class="newPrice" title="Premium">
<ins>14,90</ins>
</a>
</div>
And this is my output:
{
"title": "Premium",
"link": "https://test.com",
"picture": "https://pic.gif",
"price": null
},
You are searching for the wrong class. First search for the class 'newPrice' to get the a-block with:
price = book.find('a', attrs={'class': 'newPrice'})
Then you can search inside this a-block for the ins-block like:
price = book.find('a', attrs={'class': 'newPrice'}).find('ins')
Then your result would look like:
<ins>14,90</ins>
For final result strip the html tags:
price = book.find('a', attrs={'class': 'newPrice'}).find('ins').text.strip()
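Integrated into the question's loop, with a guard in case a listing has no newPrice block (the None check is an addition, not from the original answer):
for book in books:
    title = book.find('h3').text
    link = base_url + book.find('a')['href']
    picture = base_url + book.find('img')['src']
    price_tag = book.find('a', attrs={'class': 'newPrice'})
    price = price_tag.find('ins').text.strip() if price_tag else None
    result.append({'title': title, 'link': link, 'picture': picture, 'price': price})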

How to extract links from elements?

I am trying to extract the links of every individual member, but I am not getting any output:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
soup = BeautifulSoup(r.text,'lxml')
for link in soup.find_all('h2', class_='resultTitle'):
    link1 = link.find('a')
    print(link1['href'])
You need to request the URL with the headers param; see the requests documentation for more details.
Here resultContent is the div class of the "top doctors in Delhi-NCR" results container, and cardWrap is the div class of each doctor card.
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Custom user agent'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers=headers)
soup = BeautifulSoup(r.text,'lxml')
resultContentArray = soup.find('div',{'class':'resultContent'}).find_all("div",{'class':'cardWrap'})
for rr in resultContentArray:
    title = rr.find('h2', {'class': 'resultTitle'})
    link = rr.find("a", href=True)
    if link is not None:
        print(link['href'])
O/P:
https://www.asklaila.com/category/Delhi-NCR/-/doctors/doctor/?category=176
https://www.asklaila.com/search/Delhi-NCR/greater-kailash-1/doctors/
https://www.asklaila.com/search/Delhi-NCR/-/maternity-hospital/
https://www.asklaila.com/Delhi-NCR/
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/
https://www.asklaila.com/search/Delhi-NCR/-/doctors/20
Use:
html.parser
custom header User-agent
soup.select feature
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('h2[class="resultTitle"] > a'):
    print(link['href'])
The output:
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/sector-40/dr-amit-yadav/1ik21lZw/
Using SoupStrainer:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        print(link['href'])
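That strainer keeps every <a> on the page, including navigation links. Narrowing it to the result headings should keep only the member links; a sketch reusing the same response:
# Parse only the <h2 class="resultTitle"> headings and their contents,
# then pull the single <a> out of each heading.
only_titles = SoupStrainer('h2', attrs={'class': 'resultTitle'})
titles = BeautifulSoup(response, 'html.parser', parse_only=only_titles)
for a in titles.find_all('a', href=True):
    print(a['href'])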
There are twenty correct member links to retrieve. A concise way is to use a CSS selector with the parent class and a child combinator to get the a tag within:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers= {'User-Agent' : 'Mozilla/5.0'})
soup = BeautifulSoup(r.content,'lxml')
links = [item['href'] for item in soup.select('.resultTitle > a')]
print(links)
The server is looking for a User-Agent in the headers to prevent users from scraping the content; you could set the request headers as a workaround.
from bs4 import BeautifulSoup
import requests
headers = dict()
headers['User-Agent']= "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers=headers)
soup = BeautifulSoup(r.text,'lxml')
# with open('h.html','w') as w:
# w.write(soup.text)
for link in soup.find_all('h2', class_='resultTitle'):
    link1 = link.find('a')
    print(link1['href'])
Should give you
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/

How do I search within a website using the 'requests' module?

I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine and search companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re
companylist = ['ABEX Dachdecker Handwerks-GmbH']
url = 'https://www.firmenwissen.de/index.html'
payloads = {
    'searchform': 'UFT-8',
    'phrase': 'ABEX Dachdecker Handwerks-GmbH',
    "mainSearchField__button": 'submit'
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')
link_list= []
links = soup.findAll('a')
for li in links:
    link_list.append(li.get('href'))
print(link_list)
This code should bring me the next page with company information. But unfortunately, it returns only the home page. How can I do this?
Change the initial URL you are doing the search on. Grab the appropriate hrefs only and add them to a set to ensure no duplicates (or alter the selector to return only one match if possible); add those items to a final set for looping, to ensure you only loop over the required number of links. I have used Session on the assumption you will repeat this for many companies.
Iterate over the set using selenium to navigate to each company URL and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = set()

## searches section; gather into set
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
        # print(soup.select_one('.fp-result').text)
        finalLinks = finalLinks.union(companyLinks)

for item in finalLinks:
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)
d.quit()
Just the first links:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = []

## searches section; add to list
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
        finalLinks.append(companyLink)

for item in set(finalLinks):
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)
d.quit()
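Since the question asked specifically about the requests module, the company pages can likely be fetched with the same Session instead of selenium, assuming they render without JavaScript (which is not guaranteed); a sketch reusing the selectors from above:
with requests.Session() as s:
    for item in set(finalLinks):
        page = s.get(item, headers=headers)
        detail = BS(page.content, 'lxml')
        info = detail.select_one('.yp_abstract_narrow')
        address = detail.select_one('.yp_address')
        if info and address:
            print(info.get_text(strip=True), address.get_text(strip=True))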
