I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine and search companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re
# Company names of interest. This list is not referenced below; the search
# phrase is hard-coded in the payload.
companylist = ['ABEX Dachdecker Handwerks-GmbH']

url = 'https://www.firmenwissen.de/index.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

# Form fields sent with the search request.
payloads = {
    'searchform': 'UFT-8',
    'phrase': 'ABEX Dachdecker Handwerks-GmbH',
    "mainSearchField__button": 'submit',
}

# Submit the form and parse whatever page comes back.
html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')

# Collect the href of every anchor on the returned page.
links = soup.findAll('a')
link_list = [anchor.get('href') for anchor in links]
print(link_list)
This code should bring me the next page with company information. But unfortunately, it returns only the home page. How can I do this?
Change the initial URL you post the search to: use the results page (ergebnis.html) rather than index.html. Grab only the relevant hrefs and add them to a set so there are no duplicates (or alter the selector to return only one match, if possible); then add those items to a final set so that you loop over only the required number of links. I have used a Session on the assumption that you will repeat this for many companies.
Iterate over the set using selenium to navigate to each company url and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
# Search firmenwissen.de for each company, gather every company-entry link
# from the result pages, then open each link in Chrome and print two fields.
companyList = ['ABEX Dachdecker Handwerks-GmbH', 'SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = set()

## searches section; gather into set
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit',
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        # The hrefs are prefixed with baseUrl to form full URLs; the set
        # removes duplicates within one result page.
        companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
        # print(soup.select_one('.fp-result').text)
        finalLinks |= companyLinks

# Start the browser only once the links are known, and always shut it down,
# even if a page is missing one of the expected elements.
d = webdriver.Chrome()
try:
    for item in finalLinks:
        d.get(item)
        # find_element(by, value) works on Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.
        info = d.find_element('css selector', '.yp_abstract_narrow')
        address = d.find_element('css selector', '.yp_address')
        print(info.text, address.text)
finally:
    d.quit()
Just the first links:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver
# Search firmenwissen.de for each company, keep only the first company-entry
# link of each result page, then open each link in Chrome and print two fields.
companyList = ['ABEX Dachdecker Handwerks-GmbH', 'SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = []

## searches section; add to list
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase': company,
            "mainSearchField__button": 'submit',
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        firstMatch = soup.select_one("[href*='firmeneintrag/']")
        if firstMatch is None:
            # select_one returns None when nothing matches; skip this company
            # instead of failing with a TypeError on the subscript.
            print('No company entry found for', company)
            continue
        companyLink = baseUrl + firstMatch['href']
        finalLinks.append(companyLink)

# Start the browser only once the links are known, and always shut it down,
# even if a page is missing one of the expected elements.
d = webdriver.Chrome()
try:
    # dict.fromkeys removes duplicates while keeping the search order.
    for item in dict.fromkeys(finalLinks):
        d.get(item)
        # find_element(by, value) works on Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.
        info = d.find_element('css selector', '.yp_abstract_narrow')
        address = d.find_element('css selector', '.yp_address')
        print(info.text, address.text)
finally:
    d.quit()
Related
I want to get my results from my college's website with python, I typed this script:
import requests
import time
from bs4 import BeautifulSoup
# Make a request to the website
url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp'
response = requests.get(url)
# Parse the response and create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')
# Find the input field we need to fill with our ID
input_field = soup.find('input', {'name': 'x_st_settingno', 'id': 'x_st_settingno'})
input_field['value'] = 8936  # Fill in our ID
# Look up the submit button so its name/value pair can be sent with the form
# data (nothing is "clicked"; the form is submitted by the POST below).
submit_button = soup.find('input', {'name': 'Submit', 'id': 'Submit'})
data = {input_field['name']: input_field['value'], submit_button['name']: submit_button['value']}
response2 = requests.post(url, data=data)
# Parse the response and create a BeautifulSoup object
soup2 = BeautifulSoup(response2.text, 'html.parser')
# NOTE(review): this queries `soup` (the first response), not `soup2`, and
# passes a CSS selector string to find(), which expects a tag name.
print(soup.find('form:nth-of-type(2) table tbody tr:first-of-type td b font'))
But it always returns None, and I do not know why.
The print(soup.find('form:nth-of-type(2) table tbody tr:first-of-type td b font')) part is just the header of the table that contains the link to my results. If I look for the link itself, it returns None as well.
What am I doing wrong? I am not good at web scraping; I have just started learning. I hope you can help me.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}


def main(url):
    """Query the listing at *url*, follow the first result link, print its tables.

    The search is sent as GET query parameters. The first anchor matching
    '.ewTableRow span.aspmaker a' is resolved against *url*, fetched, and the
    tables parsed from it by pandas are printed.
    """
    query = {
        "Submit": "%C8%CD%CB",
        "x_dep": "",
        "x_gro": "",
        "x_sec": "",
        "x_st_name": "",
        "x_st_settingno": "8936",
        "z_dep": "=",
        "z_gro": "=",
        "z_sec": "LIKE",
        "z_st_name": "LIKE",
        "z_st_settingno": "="
    }
    with requests.Session() as req:
        req.headers.update(headers)
        listing = req.get(url, params=query)
        soup = BeautifulSoup(listing.content, 'lxml')
        anchor = soup.select_one('.ewTableRow span.aspmaker a')
        # The href is resolved against the listing URL before fetching.
        res = urljoin(url, anchor['href'])
        detail = req.get(res)
        df = pd.read_html(detail.content)
        print(df)


main('http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp')
I'm writing code to web-scrape the Transfermarkt website, but I'm having some issues with it.
The code used to return an error, which was fixed through the topic: Loop thru multiple URLs in Python - InvalidSchema("No connection adapters were found for {!r}".format
After this fix, other problems appeared.
First, the code is duplicating the results in the data frame.
Second, the code is taking only the last element of each URL. What I actually want is to get all the agencies' URLs for the pages in pagina = range(1) and then scrape all players of each agency, through the URLs scraped in the first part.
P.S.: pagina = range(1) will become range(1, 40); that is the number of pages I will scrape to get all the agencies' links.
Can anyone give me a hand with these issues?
Thanks!
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.sessions import default_headers
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so the nesting of the loops below cannot be read off the text. The comments
# describe individual statements only.

# Accumulators, one list per column of the DataFrame built further down.
nome=[]
posicao=[]
nacionalidade=[]
idade=[]
clube=[]
contrato=[]
valor=[]
# Site root that is prepended to every href collected in main().
tf = f"http://www.transfermarkt.com.br"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}
# Page numbers substituted into the URL template passed to main().
pagina = range(1,5)
def main(url):
with requests.Session() as req:
# Agency links gathered from the overview pages.
links = []
for lea in pagina:
print(f"Extraindo links da página {lea}")
r = req.get(url.format(lea), headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
# One link per <td class="hauptlink">, taken from the href of the cell's next element.
link = [f"{tf}{item.next_element.get('href')}" for item in soup.findAll(
"td", class_="hauptlink")]
links.extend(link)
print(f"Collected {len(links)} Links")
time.sleep(1)
# Visit each collected link and read its 'odd'/'even' table rows.
for url in links:
# NOTE(review): uses requests.get directly rather than the session `req`.
r= requests.get(url, headers=headers)
# NOTE(review): bare expression; the status code is not used.
r.status_code
soup = BeautifulSoup(r.text, 'html.parser')
player_info= soup.find_all('tr', class_=['odd', 'even'])
for info in player_info:
player = info.find_all("td")
vall= info.find('td', {'class': 'zentriert hauptlink'})
# Cells are read by position; indexes 4 and 6 take the alt text of an image.
nome.append(player[2].text)
posicao.append(player[3].text)
nacionalidade.append(player[4].img['alt'])
idade.append(player[5].text)
clube.append(player[6].img['alt'])
contrato.append(player[7].text)
# NOTE(review): appends the result of info.find(...) itself, unlike the
# .text / ['alt'] values appended above.
valor.append(vall)
time.sleep(1)
# Assemble the accumulated columns into a DataFrame and print it.
df = pd.DataFrame(
{"NOME":nome,
"POSICAO":posicao,
"NACIONALIDADE":nacionalidade,
"IDADE":idade,
"CLUBE":clube,
"CONTRATO":contrato,
"VALOR":valor}
)
print(df)
# NOTE(review): bare expression; has no effect when run as a script.
df
#df.to_csv('MBB.csv', index=False)
main("https://www.transfermarkt.com.br/berater/beraterfirmenuebersicht/berater?ajax=yw1&page={}")
I'm working on web scraping and, as a first step, want to collect all the pages. I have tested the code I wrote on another site; however, I am having a problem getting the next page link (href).
Here's the code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975'  # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# Every URL that was requested, in order.
pages = []

with requests.Session() as session:
    page_number = 1
    url = "https://www.last.fm/user/" + userName + "/library/artists?page="

    # Follow the "next" pagination element until the page has none.
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)

        next_link = soup.find("li", class_="pagination-next")
        if next_link is None:
            break

        # The href is read from the <li> element itself.
        url = urljoin(url, next_link["href"])
        page_number += 1
As you can see, the href of this site presents the link as "?page=2", which does not allow me to get its content (https://www.last.fm/user/brendanm1975/library/artists?page=2).
I've already inspected the variables, and I'm getting the values.
print(url) # output: https://www.last.fm/user/brendanm1975/library/artists?page=
next_link.find('a').get('href') # output: '?page=2'
Does anyone know how to get around this?
What happens?
You try urljoin(url, next_link["href"]), but next_link does not have an href attribute because you are selecting the <li>, not the <a>.
How to fix?
Option#1 - Just select the <a> in your urljoin():
url = urljoin(url, next_link.a["href"])
Option#2 - Select the <a> directly:
next_link = soup.select_one('li.pagination-next a')
Example
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
userName = 'brendanm1975'  # just for testing
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# Every URL that was requested, in order.
pages = []

with requests.Session() as session:
    url = "https://www.last.fm/user/" + userName + "/library/artists?page=1"

    # Walk the pagination: stop as soon as a page has no "next" list item.
    while True:
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        pages.append(url)

        next_item = soup.find("li", class_="pagination-next")
        if next_item is None:
            break

        # The href lives on the <a> inside the <li>; resolve it against the
        # current URL.
        url = urljoin(url, next_item.a["href"])
Output
['https://www.last.fm/user/brendanm1975/library/artists?page=1',
'https://www.last.fm/user/brendanm1975/library/artists?page=2',
'https://www.last.fm/user/brendanm1975/library/artists?page=3',
'https://www.last.fm/user/brendanm1975/library/artists?page=4',
'https://www.last.fm/user/brendanm1975/library/artists?page=5',
'https://www.last.fm/user/brendanm1975/library/artists?page=6',
'https://www.last.fm/user/brendanm1975/library/artists?page=7',
'https://www.last.fm/user/brendanm1975/library/artists?page=8',
'https://www.last.fm/user/brendanm1975/library/artists?page=9',
'https://www.last.fm/user/brendanm1975/library/artists?page=10',
'https://www.last.fm/user/brendanm1975/library/artists?page=11',
'https://www.last.fm/user/brendanm1975/library/artists?page=12',
'https://www.last.fm/user/brendanm1975/library/artists?page=13',
'https://www.last.fm/user/brendanm1975/library/artists?page=14',
'https://www.last.fm/user/brendanm1975/library/artists?page=15',
'https://www.last.fm/user/brendanm1975/library/artists?page=16',
'https://www.last.fm/user/brendanm1975/library/artists?page=17',
'https://www.last.fm/user/brendanm1975/library/artists?page=18',...]
This script returns a list of URLs found on the web page.
import requests
from bs4 import BeautifulSoup as BS
from bs4 import Comment
# Print every link found on the ctflearn.com front page. Hrefs starting with
# '/' are prefixed with the site root; fragment links ('#...') are skipped.
with requests.session() as session:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'}
    # The page is fetched with requests.get, not through the session.
    r = requests.get('https://ctflearn.com', verify=False, headers=headers)
    soup = BS(r.text, 'html.parser')

    for tag in soup.find_all('a'):
        href = tag.get('href')
        first_char = href[0]
        if first_char == '#':
            continue
        if first_char == '/':
            print('https://ctflearn.com' + href)
        else:
            print(href)
However, what I am interested in is to visit these web pages and find the links within these pages as well. I know it is possible by using a for loop, but I don't know how to implement it.
Thanks for the help.
You could simply try to use two lists (to_visit and visited) with a check if a url is already in one of those lists before you add it to to_visit.
import requests
from bs4 import BeautifulSoup as BS
from bs4 import Comment
# Breadth-first crawl starting at ctflearn.com. `to_visit` is the queue of
# URLs still to fetch; `visited` records the URLs already fetched. A URL is
# only queued if it is in neither list.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'}
to_visit = ['https://ctflearn.com']
visited = []

with requests.session() as session:
    while to_visit:
        url = to_visit.pop(0)
        visited.append(url)
        print('visited: "{0}"'.format(url))

        # Fetch through the session so connections are reused.
        r = session.get(url, verify=True, headers=headers)
        soup = BS(r.text, 'html.parser')

        for tag in soup.find_all('a'):
            link = tag.get('href')
            # Skip anchors without an href, empty hrefs (indexing link[0]
            # would raise IndexError) and in-page fragment links.
            if not link or link.startswith('#'):
                continue
            # Hrefs starting with '/' are prefixed with the site root.
            if link.startswith('/'):
                link = 'https://ctflearn.com' + link
            if link not in visited and link not in to_visit:
                to_visit.append(link)
However, at some point you will run into a problem, because you will find and try to access something that is not a URL, which is why I would recommend using a validator:
import validators
validators.url(url) # returns True if "url" is a valid url
I am trying to extract the links of every individual member but I am not getting output:
from bs4 import BeautifulSoup
import requests
# Fetch the doctors listing (no custom headers are sent) and print the link
# inside every <h2 class="resultTitle">.
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
soup = BeautifulSoup(r.text, 'lxml')
for link in soup.find_all('h2', class_='resultTitle'):
    link1 = link.find('a')
    # print() call form: valid on Python 3, and on Python 2 for one argument.
    print(link1['href'])
You need to request the URL with a header parameter (more details).
Here resultContent is the class of the div that holds the top-doctors-in-Delhi-NCR results, and cardWrap is the class of each doctor's card div.
from bs4 import BeautifulSoup
import requests
# Fetch the doctors listing with a User-Agent header and print the first link
# of every result card.
headers = {'User-Agent': 'Custom user agent'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

# find() returns None when the container is missing; fall back to no cards
# instead of failing with an AttributeError.
resultContent = soup.find('div', {'class': 'resultContent'})
if resultContent is not None:
    resultContentArray = resultContent.find_all("div", {'class': 'cardWrap'})
else:
    resultContentArray = []

for rr in resultContentArray:
    # First anchor in the card that carries an href attribute.
    link = rr.find("a", href=True)
    if link is not None:
        print(link['href'])
O/P:
https://www.asklaila.com/category/Delhi-NCR/-/doctors/doctor/?category=176
https://www.asklaila.com/search/Delhi-NCR/greater-kailash-1/doctors/
https://www.asklaila.com/search/Delhi-NCR/-/maternity-hospital/
https://www.asklaila.com/Delhi-NCR/
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/
https://www.asklaila.com/search/Delhi-NCR/-/doctors/20
Use:
html.parser
custom header User-agent
soup.select feature
from bs4 import BeautifulSoup
import requests
# Fetch the doctors listing with a browser-like User-Agent and print the href
# of each anchor that is a direct child of an <h2> whose class attribute is
# exactly "resultTitle".
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')

title_anchors = soup.select('h2[class="resultTitle"] > a')
for anchor in title_anchors:
    print(anchor['href'])
The output:
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/sector-40/dr-amit-yadav/1ik21lZw/
Using SoupStrainer:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
# Download the listing with httplib2 and parse only the <a> tags, printing
# the href of each one that has it.
http = httplib2.Http()
status, response = http.request('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')

only_anchors = SoupStrainer('a')
parsed = BeautifulSoup(response, 'html.parser', parse_only=only_anchors)
for link in parsed:
    if not link.has_attr('href'):
        continue
    print(link['href'])
There are twenty correct links to retrieve for members. A concise way is to use a CSS selector: the parent class with a child combinator to get the a tag within.
from bs4 import BeautifulSoup
import requests
# Fetch the doctors listing and collect the href of every anchor that is a
# direct child of an element with class "resultTitle".
request_headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=request_headers)
soup = BeautifulSoup(r.content, 'lxml')

links = []
for item in soup.select('.resultTitle > a'):
    links.append(item['href'])
print(links)
The server looks for a User-Agent in the request headers to prevent users from scraping the content.
You could set the request headers as a workaround.
from bs4 import BeautifulSoup
import requests
# Fetch the doctors listing with a browser-like User-Agent and print the link
# inside every <h2 class="resultTitle">.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
# Debugging aid: dump the page text to a file to inspect what the server sent.
# with open('h.html','w') as w:
#     w.write(soup.text)
for link in soup.find_all('h2', class_='resultTitle'):
    link1 = link.find('a')
    # print() call form: valid on Python 3, and on Python 2 for one argument.
    print(link1['href'])
Should give you
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/