Scraping tag attribute BeautifulSoup - python

I would like to scrape every data-oid tag from this page, but the code returns nothing in the output.
Code
# NOTE(review): the odds table on this page is injected by JavaScript, so the
# static HTML returned by requests.get() does not contain it — see the answer
# below, which fetches the table from the match-odds API endpoint instead.
url = 'https://www.betexplorer.com/soccer/south-korea/k-league-2/bucheon-fc-1995-jeonnam/EDwej14E/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', class_='table-main')
for rows in table.find_all('tr')[1:]:
    for row in rows.find_all('td'):
        # Bug fix: Tag has no `get_attrs` attribute — the original raised
        # AttributeError. .get() returns None for cells without the attribute.
        data = row.get('data-oid')
        print(data)

The table part of the page is loaded from an external URL via JavaScript. To get the data along with the tags that have data-oid= attributes, you can use this example:
import requests
from bs4 import BeautifulSoup

url = "https://www.betexplorer.com/soccer/south-korea/k-league-2/bucheon-fc-1995-jeonnam/EDwej14E/"

# The match id is the last path segment of the match URL.
match_id = "EDwej14E"
api_url = "https://www.betexplorer.com/match-odds/{}/1/1x2/".format(match_id)

# The odds endpoint rejects requests that lack a Referer header.
headers = {"Referer": "https://www.betexplorer.com"}
payload = requests.get(api_url, headers=headers).json()

# The JSON payload carries the rendered table HTML under the "odds" key.
soup = BeautifulSoup(payload["odds"], "html.parser")

odds_table = soup.find("table", class_="table-main")
for tr in odds_table.find_all("tr")[1:]:
    # Only cells that actually carry a data-oid attribute are selected.
    for cell in tr.select("td[data-oid]"):
        print(cell["data-oid"])
Prints:
...
4kqjpxv464x0xc6aif
4kqjpxv464x0xc6aie
4kqjpxv498x0x0
4kqjpxv464x0xc6aif
4kqjpxv464x0xc6aie
4kqjpxv498x0x0
4kqjpxv464x0xc6aif

Related

How can Beautifulsoup scrape the pages inside this list of hyperlinks?

I am trying to scrape the contents of the hyperlinks on the left side of this page. I am already able to scrape the contents of the hyperlinks, so now I am trying to run the script on each individual hyperlink that is on the left side of the page.
URL: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-3.html
I think what needs to be done is the url be a dynamic variable, and that variable is a loop which will go through all of the hyperlinks in the URL above. Although I'm not exactly sure if this is the best way to approach it, as this is my first project
Any advice is greatly appreciated.
Here is the code that I am trying to plug this into.
import csv
import requests
from bs4 import BeautifulSoup as bs

url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')
table = soup.find(id="table_maina")

headers = []
datarows = []

# Derive the CSV filename from the page heading ("Dogecoin Address <addr>").
item = soup.find('h1').text
newitem = item.replace('Dogecoin','')
finalitem = newitem.replace('Address','')

for row in table.find_all('tr'):
    heads = row.find_all('th')
    if heads:
        headers = [th.text for th in heads]
    else:
        datarows.append([td.text for td in row.find_all('td')])

# Bug fix: the original passed an unclosed open() straight to csv.writer,
# leaking the file handle; a with-block guarantees flush and close.
with open(f'{finalitem}.csv', 'w', newline='') as fh:
    fcsv = csv.writer(fh)
    fcsv.writerow(headers)
    fcsv.writerows(datarows)
A simple way would be to make an initial request and extract all the links in the second column of the table.
Then loop those links, make requests, and continue with your existing code, except to also handle cases where no table present.
import csv
import requests
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-3.html')
    soup = bs(r.content, 'lxml')
    # Address links live in the second column of the listing table.
    address_links = [i['href'] for i in soup.select('.table td:nth-child(2) > a')]

    for url in address_links:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = soup.find(id="table_maina")
        if not table:
            print('no table for: ', url)
            continue

        # Derive the CSV filename from the page heading.
        item = soup.find('h1').text
        newitem = item.replace('Dogecoin','')
        finalitem = newitem.replace('Address','')

        # Bug fix: headers/datarows were defined once outside the loop in the
        # original, so every successive CSV also contained all previous
        # addresses' rows. Reset them per address instead.
        headers = []
        datarows = []
        for row in table.find_all('tr'):
            heads = row.find_all('th')
            if heads:
                headers = [th.text for th in heads]
            else:
                datarows.append([td.text for td in row.find_all('td')])

        # Bug fix: close the CSV file deterministically instead of leaking the
        # handle returned by open().
        with open(f'{finalitem}.csv', 'w', newline='') as fh:
            fcsv = csv.writer(fh)
            fcsv.writerow(headers)
            fcsv.writerows(datarows)

Obtain second tag inside a find_all

I am trying to obtain the second a tag inside a specific td, but I'm not able to get just the text of the second a tag — I'm getting data from all of the a tags.
Later I will do a for to obtain the data of the 10 td. As you can see in the image I want the data of the second a inside each of the 10 td:
my code:
from requests import get
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.oddsportal.com/soccer/spain/laliga'

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

# Each fixture row keeps both participants inside this cell.
cells = html_soup.find_all("td", {"class": "name table-participant"})
print(len(cells))

# Show the combined text of the first fixture cell.
print(cells[0].text)
You need to select for the second a tag
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.oddsportal.com/soccer/spain/laliga'
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')

# Fixture rows carry an xeid attribute; within each, the descendant whose
# href contains "soccer" is the second (fixture-name) link.
print([a.text for a in soup.select('#tournamentTable tr[xeid] [href*=soccer]')])
Though you can drop the table id and use:
# Same selection without anchoring to the table id:
print([item.text for item in soup.select('tr[xeid] [href*=soccer]')])
For the rows of the table, with useful match data as a list, I would use:
# Rows with an xeid attribute are the actual match rows (header rows lack it).
rows = soup.select('#tournamentTable tr[xeid]')

Access Hidden Data on a page

I need to access the following website: http://mothoq.com/store/22
Scroll down until I see the phone icon.
Click on it, and scrape the phone number.
I have successfully connected to the website, and able to scrape all data needed, except of the phone number.
I have tried to use
soup.find_all('p',attrs={"align":"center"})
my code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)

r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")

results = soup.find('div', attrs={'id': 'subtitle'})

storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})

# Bug fix: the original wrapped these lookups in `for storeData in results:`
# (iterating the children of one div) without ever using the loop variable,
# re-running identical soup-wide queries each pass — the loop is dropped.
#
# Bug fix: the phone number is not inside <p align="center">; it sits in the
# hidden div id="store-telephone-form" (second <p> inside it).
phone_form = soup.find('div', attrs={"id": "store-telephone-form"})
storePhone = phone_form.select('p')[1].text if phone_form else None

storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"})['href']
storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"})['href']
storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"})['href']

print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second
<p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)

r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")

results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})

# Bug fix: default every field first — the original left storePhone unbound
# when the try-block failed early, so print(storePhone) raised NameError.
storePhone = storeTwitter = storeFacebook = storeLinkedin = None
try:
    # The phone number lives in a hidden div; its second <p> holds the digits.
    storePhone = soup.find('div', attrs={"id":"store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"}).get('href')
except (AttributeError, IndexError):
    # Bug fix: catch only the expected lookup failures (element or <p> missing)
    # instead of a bare `except:` that also swallowed KeyboardInterrupt etc.
    pass

print(storePhone)

How do I simultaneously scrape two pages and produce two distinct lists within one nested 'for-loop'?

I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both these pages need to end up on distinctly named lists.
To explain with example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]

# Bug fix: `headers` was never defined in the original, so
# requests.get(url, headers=headers) raised NameError before scraping anything.
headers = {}

# Bug fix: pair each URL with *its own* list. The original's inner
# `for cat_list in categories:` appended every player to both lists, which is
# why the two lists came out identical.
for url, cat_list in zip(urls, categories):
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        cat_list.append(a.text)

print(ws48_list)
print(ws_list)
This ends up printing two identical lists when I was shooting for 2 lists unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists. Just create new ones. Make a function to do the scrape and pass each url in turn to it.
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]


def parse_page(url, headers=None):
    """Fetch *url* and return the text of every link in its stats table.

    Bug fix: the original declared ``headers={}`` — a mutable default shared
    across every call; ``None`` with a fresh dict per call is the safe idiom.
    """
    response = requests.get(url, headers=headers or {})
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    return [a.text for a in section.find_all('a')]


ws_list, ws48_list = [parse_page(url) for url in urls]
print('ws_list = %r' % ws_list)
# Bug fix: the original label read 'ws8_list' — a typo for ws48_list.
print('ws48_list = %r' % ws48_list)
Just add them to the appropriate list and the problem is solved?
# Walk the URLs with their index so each page feeds its matching list.
for idx, link in enumerate(urls):
    response = requests.get(link)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for anchor in section.find_all('a'):
        categories[idx].append(anchor.text)

print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs


def scrape(url):
    """Return the link texts (player names) from the stats table at *url*."""
    page = requests.get(url)
    soup = bs(page.content, 'html.parser')
    stats = soup.find('table', class_='stats_table')
    return [anchor.text for anchor in stats.find_all('a')]


ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')
print(ws_list)
print(ws48_list)

Problems retrieving information from imdb

I'm trying to get the movie titles from an imdb watchlist. This is my code:
import requests, bs4

# Bug fix: `url` was never defined in the original snippet.
url = 'http://www.imdb.com/user/ur69187878/watchlist'

res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")

# Bug fix: find_all() expects a tag name, not a CSS selector, so
# find_all('.lister-item-header') matched nothing; select() takes selectors.
# (The watchlist is rendered client-side, so the titles still won't appear in
# the static HTML — see the JSON-parsing answer below.)
print(soup.select('.lister-item-header'))
Even though '.lister-item-header' exists in the chrome developer console it doesn't exist in the html file that the requests module downloaded. I've also tried using regular expressions. What would be the best way of retrieving the titles?
You should select elements by their class in this way.
import requests
import bs4

url = 'http://www.imdb.com/chart/top'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")

# Direct child <a> of each title cell carries the movie name.
for link in soup.select('.titleColumn > a'):
    print(link.text)
Or you can do it in this way:
import requests
import bs4

url = 'http://www.imdb.com/chart/top'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")

# Equivalent without CSS selectors: locate each title cell, then read the
# text of the link it contains.
title_cells = soup.find_all('td', class_='titleColumn')
for cell in title_cells:
    print(cell.a.text)
The data is loaded from a JSON object that is embedded in the raw HTML file, so we can parse it and get the titles.
import requests
import bs4
import json

# Bug fix: the original URL contained invisible zero-width characters (a
# copy/paste artifact) that corrupted the request path.
url = 'http://www.imdb.com/user/ur69187878/watchlist?ref_=wt_nv_wl_all_1'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")

# The watchlist is rendered client-side; its data ships inside an inline
# <script> as the argument to IMDbReactInitialState.push(...).
js_elements = soup.find_all('script')
js_text = None
search_str = 'IMDbReactInitialState.push('
for element in js_elements:
    text = element.text
    if search_str in text:
        js_text = text.strip()
        break

if js_text is None:
    # Bug fix: the original fell through to js_text.index() and raised an
    # opaque AttributeError when the marker script was absent.
    raise ValueError('IMDbReactInitialState payload not found on page')

# Slice off the "...push(" prefix and the trailing ");" to leave bare JSON.
json_start = js_text.index(search_str) + len(search_str)
json_text = js_text[json_start:-2]
# Bug fix: reuse json_text instead of recomputing the identical slice.
json_obj = json.loads(json_text)

for title in json_obj['titles']:
    json_title = json_obj['titles'][title]
    print(json_title['primary']['title'])

Categories

Resources