I would like to scrape the player URLs from every page of this website https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780,
but I can only scrape the first page. Why?
I wrote a for loop with range():
import pandas as pd
import requests
from bs4 import BeautifulSoup

list_url = []

def get_player_urls(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    link = 'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'
    content = requests.get(link, headers=headers)
    soup = BeautifulSoup(content.text, 'html.parser')
    for urls in soup.find_all('a', class_='spielprofil_tooltip'):
        url = 'https://www.transfermarkt.it' + urls.get('href')
        print(url)
        list_url.append(url)
    return

for page in range(1, 11, 1):
    get_player_urls(page)

df_url = pd.DataFrame(list_url)
df_url.to_csv('df_url.csv', index=False, header=False)
You're not actually interpolating the page number into the URL. Also, there's no need for the bare return at the end of your function; you aren't returning anything:
import pandas as pd
import requests
from bs4 import BeautifulSoup

list_url = []

def get_player_urls(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    link = 'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'.format(page=page)  # <-- add this
    content = requests.get(link, headers=headers)
    soup = BeautifulSoup(content.text, 'html.parser')
    for urls in soup.find_all('a', class_='spielprofil_tooltip'):
        url = 'https://www.transfermarkt.it' + urls.get('href')
        print(url)
        list_url.append(url)

for page in range(1, 11, 1):
    get_player_urls(page)

df_url = pd.DataFrame(list_url)
df_url.to_csv('df_url.csv', index=False, header=False)
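The same interpolation can also be done with an f-string. As a minimal sketch, the function could return each page's URLs instead of appending to a global list (whether the site still uses the spielprofil_tooltip class is taken from the question):

def get_player_urls(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    # an f-string interpolates the page number directly into the URL
    link = f'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'
    soup = BeautifulSoup(requests.get(link, headers=headers).text, 'html.parser')
    # return this page's URLs instead of mutating a module-level list
    return ['https://www.transfermarkt.it' + a.get('href')
            for a in soup.find_all('a', class_='spielprofil_tooltip')]

list_url = [url for page in range(1, 11) for url in get_player_urls(page)]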
I scraped an HTML table from the Yahoo Finance website and tried to export the table to a CSV file. However, the CSV file does not contain the correct output, even though the output printed to my terminal looks fine. What have I done wrong here?
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

mystocks = ["XOM", "CVX", "COP", "EOG"]
stockdata = []

def getData(symbol):
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"}
    url = f"https://finance.yahoo.com/quote/{symbol}/key-statistics"
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
    print("Ticker - " + symbol)
    for t in soup.select("table"):
        for tr in t.select("tr:has(td)"):
            for sup in tr.select("sup"):
                sup.extract()
            stockdata = [td.get_text(strip=True) for td in tr.select("td")]
            if len(stockdata) == 2:
                print("{:<50} {}".format(*stockdata))

for item in mystocks:
    stockdata.append(getData(item))

df = pd.DataFrame(stockdata)
df.to_csv('file_name.csv')
You are printing the data, not returning it.
If you want all the data in one table, it also helps to add a column with the symbol each row came from. You could use something like this:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

mystocks = ["XOM", "CVX", "COP", "EOG"]

def getData(symbol):
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"}
    url = f"https://finance.yahoo.com/quote/{symbol}/key-statistics"
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
    print("Ticker - " + symbol)
    for t in soup.select("table"):
        for tr in t.select("tr:has(td)"):
            for sup in tr.select("sup"):
                sup.extract()
            stockdata = [td.get_text(strip=True) for td in tr.select("td")]
            if len(stockdata) == 2:
                # add a column with the symbol to help afterwards
                yield [symbol] + stockdata

# this will concatenate the rows for all the symbols in mystocks
df = pd.DataFrame([r for item in mystocks for r in getData(item)])
df.to_csv('file_name.csv')
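Because each row now carries its ticker symbol, the long-format frame can be reshaped later if that is useful. A rough sketch, assuming the three scraped columns are symbol, statistic label, and value (the column names below are only illustrative, since the DataFrame is created with the default 0/1/2 header):

# name the three scraped columns, then pivot to one column per ticker
df.columns = ['symbol', 'statistic', 'value']
wide = df.pivot_table(index='statistic', columns='symbol', values='value', aggfunc='first')
wide.to_csv('file_name_wide.csv')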
I'm trying to grab the link inside a td. My code does not display the link or produce the desired output. What do I need to change?
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from time import sleep
import requests

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
urllink = "https://bscscan.com/txs?block=11711353&ps=100&p=1"

reqblockdetails = requests.get(urllink, headers=headers, timeout=5)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr')
sleep(1)

for row in rowsblockdetails[1:]:
    txnhash = row.find_all('td')[1].text[0:]
    txnhashdetails = txnhash.strip()
    destination = row.find_all('td')[8].text[0:]
    destination = destination.strip()
    if str(destination) == "CoinOne: CONE Token":
        urldest = soupblockdetails.find('a', attrs={'class': 'hash-tag text-truncate'}).text
        print(" {:>1} {:<5}".format(txnhashdetails, destination))
        print(urldest)
    else:
        pass
Current Output:
0x8265a6ba5ce531df645b883e8735af57241f43e92eb3c9a88f43b89310f964bc CoinOne: CONE Token Validator: Stake2me
Needed Output:
0x8265a6ba5ce531df645b883e8735af57241f43e92eb3c9a88f43b89310f964bc CoinOne: CONE Token 0x9628735017f1a985ebaac0b203efb9e8d3ed0fef
It is better to search for the <a> element in the currently selected <td> rather than in the whole document, so I changed the code to td = row.find_all('td')[8] and later to td.find('a', ...).
Here is working code:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from time import sleep
import requests

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
urllink = "https://bscscan.com/txs?block=11711353&ps=100&p=1"

reqblockdetails = requests.get(urllink, headers=headers, timeout=5)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr')
sleep(1)

for row in rowsblockdetails[1:]:
    txnhash = row.find_all('td')[1].text[0:]
    txnhashdetails = txnhash.strip()
    td = row.find_all('td')[8]
    destination = td.text[0:].strip()
    if str(destination) == "CoinOne: CONE Token":
        urldest = td.find('a', attrs={'class': 'hash-tag text-truncate'})["href"].lstrip("/address/")
        print(" {:>1} {:<5}".format(txnhashdetails, destination))
        print(urldest)
    else:
        pass
Hope it will work.
Alternatively, try this:
t_link = soupblockdetails.find('span', attrs={'class': 'hash-tag text-truncate'})
urldest = t_link.a['href']
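One caveat on the href handling above: str.lstrip strips a set of characters rather than a literal prefix, so "/address/" only happens to work here because the address starts with "0x". On Python 3.9+ a safer sketch would be:

href = td.find('a', attrs={'class': 'hash-tag text-truncate'})["href"]
# removeprefix (Python 3.9+) drops the literal '/address/' prefix instead of a character set
urldest = href.removeprefix("/address/")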
I'd like to scrape a website to get some percentages. So far this is the code:
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

lista = []
site = 'https://es.investing.com/indices/indices-futures'
harware = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0'}
request = Request(site, headers=harware)
page = urlopen(request)
soup = BeautifulSoup(page, 'html.parser')
#print(soup)

cotizacion = soup.find_all('td', {"class": "datatable_cell__3gwri datatable_cell--align-end__Wua8C datatable_cell--" + "down__2CL8n" + " datatable_cell--bold__3e0BR table-browser_col-chg-pct__9p1T3"})
for datos in cotizacion:
    indices = datos.get_text()
    lista.append(indices)
print(lista)
With this, I get a bunch of percentages in a list.
But my problem is that this class attribute only matches when the percentage is negative, because the class name is for "down" ("down__2CL8n"); when the value is up, the class name is the same except for that part ("up__2984w"). I want to get both positive and negative values.
So I tried find_all with:
soup.find_all('td',{"class": "datatable_cell__3gwri datatable_cell--align-end__Wua8C datatable_cell--" + "down__2CL8n" or "up__2984w" +" datatable_cell--bold__3e0BR table-browser_col-chg-pct__9p1T3"})
But that doesn't work.
What format should I use to match a variable part of the class string?
The desired output is under the attribute table-browser_col-chg-pct__9p1T3. To select only the first table, you can use the CSS selector .mb-6 td.table-browser_col-chg-pct__9p1T3.
import requests
from bs4 import BeautifulSoup

URL = "https://es.investing.com/indices/indices-futures"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
}

soup = BeautifulSoup(requests.get(URL, headers=headers).content, "html.parser")
print([tag.text for tag in soup.select(".mb-6 td.table-browser_col-chg-pct__9p1T3")])
Output:
['+0,12%', '+0,73%', '+1,97%', '+0,95%', '+1,13%', '+0,03%', '-0,15%', '-0,73%', '-0,05%', '+0,22%', '-0,65%', '-0,16%', '-0,37%', '-0,21%', '+0,11%', '-0,41%', '-0,40%', '-0,15%', '-0,38%', '+0,69%', '-0,89%', '-1,13%', '+0,23%', '-0,89%', '-0,75%', '-1,51%', '-0,22%', '+0,43%', '-1,27%', '+0,92%']
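If you would rather stay with find_all, BeautifulSoup also accepts a compiled regular expression for the class_ filter, so the variable down__/up__ part can be matched directly. A sketch, with the class fragments copied from the question (note this may also match other change columns that use the same down/up classes, so combining it with the stable table-browser_col-chg-pct__9p1T3 class, as in the selector above, is more precise):

import re

# match either the down or the up variant of the cell class
cotizacion = soup.find_all('td', class_=re.compile(r'datatable_cell--(down|up)__'))
print([td.get_text() for td in cotizacion])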
I would avoid what could be dynamic class values and instead determine which column the desired values fall under, then use :nth-of-type to slice that column out of the table. To get the table, I would use an attribute = value selector to find the parent element with data-test=price-table, then move to the child table element with a descendant combinator. The aim is to develop something more robust over time. Of course, this introduces a dependency on the header string.
import requests
from bs4 import BeautifulSoup
URL = "https://es.investing.com/indices/indices-futures"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"}
soup = BeautifulSoup(requests.get(URL, headers=headers).content, "html.parser")
index = [i.text for i in soup.select('[data-test=price-table] table th')].index('% Var.') + 1
print([i.text for i in soup.select(f"[data-test=price-table] table td:nth-of-type({index})")])
You could also just use pandas read_html:
import pandas as pd
table = pd.read_html('https://es.investing.com/indices/indices-futures')[0]
table['% Var.']
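If the site rejects pandas' default user agent, a hedged workaround is to fetch the page with requests first and feed the HTML to read_html (the '% Var.' column name is taken from the snippet above):

import io
import requests
import pandas as pd

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"}
html = requests.get('https://es.investing.com/indices/indices-futures', headers=headers).text
# read_html parses every <table> in the fetched page; the first one is assumed to hold the quotes
table = pd.read_html(io.StringIO(html))[0]
print(table['% Var.'])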
You could also just do this as a next step (assuming the order doesn't matter):
cotizacion += soup.find_all('td',{"class": "datatable_cell__3gwri datatable_cell--align-end__Wua8C datatable_cell--" + "up__2984w" +" datatable_cell--bold__3e0BR table-browser_col-chg-pct__9p1T3"})
Edit:
As the comments point out that the order does matter, you can refer to this answer:
https://stackoverflow.com/a/14257743/8651239
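Alternatively, a single CSS selector with a group (comma) of selectors keeps document order, since soup.select yields matches in the order they appear on the page. A sketch, with the class names taken from the question (they may change over time):

# a grouped selector returns down and up cells interleaved in document order
cotizacion = soup.select(
    'td.datatable_cell--down__2CL8n.table-browser_col-chg-pct__9p1T3, '
    'td.datatable_cell--up__2984w.table-browser_col-chg-pct__9p1T3'
)
lista = [datos.get_text() for datos in cotizacion]
print(lista)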
I'm trying to scrape product data off of the Adidas and Nike sites. I can get the first product's details, but I can't get the code to iterate through any additional products. It looks like this is because bs4 is not turning the items inside the <div> I'm pointing to into a list, and is therefore not letting me iterate. I've searched around for solutions, but most answers just suggest changing the parser to lxml or html5lib. I don't think it's an issue with the sites' HTML, since I'm getting the same behavior for both Nike and Adidas. Both chunks of code are:
Nike
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.nike.com/w/mens-shoes-nik1zy7ok'
page = requests.get(url, headers=headers)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')
    product_grid = soup.findAll('div', attrs={'class': 'product-grid__items'})
    #print(product_grid)
    products = []
    for card in product_grid:
        name = card.find('a', attrs={'class': 'product-card__link-overlay'})
        products.append(name.text)
    print(products)
Returns
['Nike Air VaporMax Flyknit 3']
Adidas
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.adidas.com/us/men-shoes'
page = requests.get(url, headers=headers)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html5lib')
    product_grid = soup.findAll('div', attrs={'class': 'product-container___3GvlZ'})
    #print(product_grid)
    products = []
    for card in product_grid:
        name = card.find('div', attrs={'class': 'gl-product-card__name'})
        products.append(name.text)
    print(products)
Returns
['NMD_R1 Shoes']
You can check the following code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.nike.com/w/mens-shoes-nik1zy7ok'
page = requests.get(url, headers=headers)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')
    product_grid = soup.findAll('div', attrs={'class': 'product-grid__items'})
    products = []
    for card in product_grid:
        names = card.findAll('a', attrs={'class': 'product-card__link-overlay'})
        for element in names:
            products.append(element.text)
    print(products)
The issue was in name = card.find('a', attrs={'class': 'product-card__link-overlay'}).
If you print it out, you get a single name because you are using .find, not .findAll.
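The same .find versus .findAll fix applies to the Adidas snippet. A minimal sketch (the class names are the ones from the question and may have changed since):

product_grid = soup.findAll('div', attrs={'class': 'product-container___3GvlZ'})
products = []
for card in product_grid:
    # findAll collects every product name inside the grid container, not just the first
    names = card.findAll('div', attrs={'class': 'gl-product-card__name'})
    for element in names:
        products.append(element.text)
print(products)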
I am trying to extract the links for every individual member, but I am not getting any output:
from bs4 import BeautifulSoup
import requests

r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')
soup = BeautifulSoup(r.text, 'lxml')

for link in soup.find_all('h2', class_='resultTitle'):
    link1 = link.find('a')
    print link1['href']
You need to request the URL with a header parameter.
Here, resultContent is the div class wrapping the "top doctors in Delhi-NCR" results, and cardWrap is the div class of each doctor card.
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Custom user agent'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

resultContentArray = soup.find('div', {'class': 'resultContent'}).find_all("div", {'class': 'cardWrap'})
for rr in resultContentArray:
    title = rr.find('h2', {'class': 'resultTitle'})
    link = rr.find("a", href=True)
    if link is not None:
        print(link['href'])
O/P:
https://www.asklaila.com/category/Delhi-NCR/-/doctors/doctor/?category=176
https://www.asklaila.com/search/Delhi-NCR/greater-kailash-1/doctors/
https://www.asklaila.com/search/Delhi-NCR/-/maternity-hospital/
https://www.asklaila.com/Delhi-NCR/
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/
https://www.asklaila.com/search/Delhi-NCR/-/doctors/20
Use html.parser, a custom User-Agent header, and soup.select:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('h2[class="resultTitle"] > a'):
    print(link['href'])
The output:
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/sector-40/dr-amit-yadav/1ik21lZw/
Using SoupStrainer:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer

http = httplib2.Http()
status, response = http.request('https://www.asklaila.com/search/Delhi-NCR/-/doctors/')

for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        print(link['href'])
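As written, this parses every <a> on the page. A hedged variant that also sends a User-Agent (which this site appears to require, per the other answers) and narrows the strainer to the result titles might look like this:

import requests
from bs4 import BeautifulSoup, SoupStrainer

headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
# only parse the result-title headings, then read the member link inside each
only_titles = SoupStrainer('h2', attrs={'class': 'resultTitle'})
soup = BeautifulSoup(r.text, 'html.parser', parse_only=only_titles)
for h2 in soup.find_all('h2'):
    a = h2.find('a', href=True)
    if a:
        print(a['href'])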
There are twenty member links to retrieve. A concise way is to use a CSS selector with the parent class and a child combinator to get the a tag within each title:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/',headers= {'User-Agent' : 'Mozilla/5.0'})
soup = BeautifulSoup(r.content,'lxml')
links = [item['href'] for item in soup.select('.resultTitle > a')]
print(links)
The server is looking for a User-Agent header to prevent users from scraping the content, so you can set the request headers as a workaround.
from bs4 import BeautifulSoup
import requests

headers = dict()
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"

r = requests.get('https://www.asklaila.com/search/Delhi-NCR/-/doctors/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
# with open('h.html','w') as w:
#     w.write(soup.text)

for link in soup.find_all('h2', class_='resultTitle'):
    link1 = link.find('a')
    print(link1['href'])
Should give you
https://www.asklaila.com/listing/Delhi-NCR/madangir/dr-vp-kaushik/0Vm4m7jP/
https://www.asklaila.com/listing/Delhi-NCR/sector-19/dr-arvind-garg/1BEtXFWP/
https://www.asklaila.com/listing/Delhi-NCR/indira-puram/dr-sanjay-garg/kUUpPPzH/
https://www.asklaila.com/listing/Delhi-NCR/new-friends-colony/dr-rk-caroli/GK5X4dSI/
https://www.asklaila.com/listing/Delhi-NCR/vasant-vihar/dr-sourabh-nagpal/0v1s6pGr/
https://www.asklaila.com/listing/Delhi-NCR/ncr/care24/0bbotWCf/
https://www.asklaila.com/listing/Delhi-NCR/soami-nagar-north/sudaksh-physiotherapy-psychology-orthopaedic-psychiatry-clinic-/kJxps7Dn/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-sb-singh/00PPdXnM/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-4/dr-kanwal-deep/09eZqT9k/
https://www.asklaila.com/listing/Delhi-NCR/kaushambi/dr-uma-kant-gupta/0ivP1mJ6/
https://www.asklaila.com/listing/Delhi-NCR/east-of-kailash/dr-harbhajan-singh/ngDklERb/
https://www.asklaila.com/listing/Delhi-NCR/uttam-nagar/dr-bb-jindal/0Z8u07oQ/
https://www.asklaila.com/listing/Delhi-NCR/greater-kailash-part-1/dr-raman-kapoor/kNFPgYfZ/
https://www.asklaila.com/listing/Delhi-NCR/dwarka-sector-7/dr-pankaj-n-surange/NpIBzM4K/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-3/dr-ritu-gupta/19IoQ4A7/
https://www.asklaila.com/listing/Delhi-NCR/vaishali-sector-5/dr-mala-bhattacharjee/ywTzyamp/
https://www.asklaila.com/listing/Delhi-NCR/vasundhara/dr-mohit-jindal/vN9FiMAd/
https://www.asklaila.com/listing/Delhi-NCR/janakpuri/dr-ravi-manocha/1Qe4iuK1/
https://www.asklaila.com/listing/Delhi-NCR/vikas-marg/sparsh/08ZpsI85/
https://www.asklaila.com/listing/Delhi-NCR/kamla-nagar/dr-deepak-guha/ETn71X1r/