I want to retrieve a specific stat (PPDA) from multiple matches on this site:
https://understat.com/match/xxxx
I have created the following to parse the HTML and loop through each match using Python, but I am struggling with how to extract the specific stat and load it into a CSV and a graph. I am a beginner and any help would be appreciated!
Code:
import pandas as pd
import re
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime
import csv
for i in range(9577, 9807):
    ppda_url = 'https://understat.com/match/' + str(i)
    ppda_data = requests.get(ppda_url)
    ppda_html = ppda_data.content
    soup = BeautifulSoup(ppda_html, 'lxml')

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(ppda_url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
To extract the data with BeautifulSoup and write it to a CSV file, first find the div element with the text PPDA, then take the next two div elements with the class progress-value (the home and away values) and read the data from those. Write it to a CSV file like so:
import requests
from bs4 import BeautifulSoup
import csv

with open('ppda.csv', 'w', newline='') as csvfile:
    # create the writer once, outside the loop
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for i in range(9577, 9807):
        ppda_url = 'https://understat.com/match/' + str(i)
        ppda_data = requests.get(ppda_url)
        ppda_html = ppda_data.content
        soup = BeautifulSoup(ppda_html, 'lxml')
        # the div whose text is 'PPDA' is followed by the home and away values
        ppda = soup.find("div", string='PPDA')
        home = ppda.findNext('div', {'class': "progress-value"})
        away = home.findNext('div', {'class': "progress-value"})
        print(home.text, away.text)
        writer.writerow([home.text, away.text])
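Note that if a match id in that blind range doesn't resolve to a page with a PPDA block, soup.find returns None and the loop dies with an AttributeError. A minimal guard, assuming the markup is otherwise the same:

ppda = soup.find("div", string='PPDA')
if ppda is None:
    # page missing or laid out differently -- skip this match id
    continue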
To graph it, look at matplotlib for a start.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
rows = []
for i in range(9577, 9807):
    ppda_url = 'https://understat.com/match/' + str(i)
    ppda_data = requests.get(ppda_url)
    ppda_html = ppda_data.content
    soup = BeautifulSoup(ppda_html, 'lxml')
    ppda = soup.find("div", string='PPDA')
    home = ppda.findNext('div', {'class': "progress-value"})
    away = home.findNext('div', {'class': "progress-value"})
    print(home.text, away.text)
    rows.append({'HOME': float(home.text), 'AWAY': float(away.text)})

# DataFrame.append was removed in pandas 2.0, so build the frame from a list of dicts
df = pd.DataFrame(rows, columns=['HOME', 'AWAY'])
df.to_csv("ppda2.csv", encoding='utf-8', index=False)
df.plot.bar()
plt.show()
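If you want the chart labelled, df.plot.bar() returns a matplotlib Axes that you can decorate before showing it, for example:

ax = df.plot.bar()
ax.set_xlabel("match")
ax.set_ylabel("PPDA")
plt.show()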
Outputs: a CSV file and a graph.
Related
I am trying to scrape a website and output the info to a CSV file. The data I am trying to extract prints to the terminal, but I need it in a CSV file. I have tried several different methods but cannot find a solution. The CSV file is created but it's just empty; there is probably something really simple I'm missing.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import time
from bs4 import BeautifulSoup
DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'
options = Options()
options.page_load_strategy = 'normal'
# Navigate to url
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")
options.add_argument("--window-size=1920x1080")
results = driver.find_element_by_class_name('program1_content_container')
soup = BeautifulSoup(results.text, 'html.parser')
# results = driver.find_element_by_class_name('program1_content_container')
p_data1 = soup.find_all("div", {"class_name": "program1_content_container"})
p_data2 = soup.find_all("div", {"class_name": "program_time"})
p_data3 = soup.find_all("div", {"class_name": "sport"})
p_data4 = soup.find_all("div", {"class": "program_text"})
print("Here is your data, I am off ot sleep now see ya ")
print(results.text)
# Create csv
programme_list = []

# Programme List
for item in p_data1:
    try:
        name = item.contents[1].find_all(
            "div", {"class": "program1_content_container"})[0].text
    except:
        name = ''
    p_data1 = [time]
    programme_list.append(p_data1)

# Programme Time
for item in p_data2:
    try:
        time = item.contents[1].find_all(
            "div", {"class": "program_time"})[0].text
    except:
        time = ''
    p_data2 = [time]
    programme_list.append(p_data2)

# Which sport
for item in p_data3:
    try:
        time = item.contents[1].find_all(
            "div", {"class": "sport"})[0].text
    except:
        time = ''
    p_data3 = [time]
    programme_list.append(p_data3)

with open('sport.csv', 'w') as file:
    writer = csv.writer(file)
    for row in programme_list:
        writer.writerow(row)
I have just tried to add an object called data_output and then print it:
data_output = [p_data1, p_data2, p_data3, p_data4]
...
print(data_output)
The output in the terminal is:
Load the data into a pandas DataFrame and export it to CSV.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")
results = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".program1_content_container")))
soup = BeautifulSoup(results.get_attribute("outerHTML"), 'html.parser')
program_time=[]
sport=[]
program_text=[]
program_info=[]
for item in soup.select(".program_details "):
    if item.find_next(class_='program_time'):
        program_time.append(item.find_next(class_='program_time').text.strip())
    else:
        program_time.append("Nan")
    if item.find_next(class_='sport'):
        sport.append(item.find_next(class_='sport').text.strip())
    else:
        sport.append("Nan")
    if item.find_next(class_='program_text'):
        program_text.append(item.find_next(class_='program_text').text.strip())
    else:
        program_text.append("Nan")
    if item.find_next(class_='program_info'):
        program_info.append(item.find_next(class_='program_info').text.strip())
    else:
        program_info.append("Nan")
df = pd.DataFrame({"program_time": program_time, "sport": sport,
                   "program_text": program_text, "program_info": program_info})
print(df)
df.to_csv("sport.csv")
csv snapshot after creation
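One small tweak worth knowing: to_csv writes the row index by default; pass index=False if you don't want it as the first column:

df.to_csv("sport.csv", index=False)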
If you don't have pandas then you need to install it.
pip install pandas
As Blue Fishy said, you can try changing to w mode only, but you may run into an encoding error.
A solution that works on your data:
import csv

programme_list = ['19:55', 'MOTORU SPORTS', 'Motoru sporta "5 minūte"', 'Iknedēļas Alda Putniņa veidots apskats par motoru sportu', '20:00', 'BASKETBOLS', '...']

with open('sport.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', lineterminator='\n')
    for row in programme_list:
        print(row)
        writer.writerow([row])
Output (the doubled quotes around "5 minūte" are standard CSV escaping of embedded quotes):
19:55
MOTORU SPORTS
"Motoru sporta ""5 minūte"""
Iknedēļas Alda Putniņa veidots apskats par motoru sportu
20:00
BASKETBOLS
...
Instead of writing binary, can you try changing wb to w?
Change
with open('sport.csv', 'wb') as file:
to
with open('sport.csv', 'w') as file:
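In Python 3, csv.writer produces str, so a binary-mode file fails on the first writerow. A minimal sketch of what goes wrong with 'wb':

import csv

with open('sport.csv', 'wb') as f:
    csv.writer(f).writerow(['a', 'b'])  # TypeError: a bytes-like object is required, not 'str'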
EDITED:
Sorry for being a bit late. Here is the code, modified based on your original code, FYI.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import time
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'

options = Options()
options.page_load_strategy = 'normal'
options.add_argument("--window-size=1920x1080")  # set options before creating the driver

# Navigate to url
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")

page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')

# the items are <p> elements, and the attribute filter key is "class"
p_data1 = soup.find_all("p", {"class": "program_info"})
p_data2 = soup.find_all("p", {"class": "program_time"})
p_data3 = soup.find_all("p", {"class": "sport"})
p_data4 = soup.find_all("p", {"class": "program_text"})

# Create csv
programme_list = []

# One row per programme
for i in range(len(p_data1)):
    programme_list.append([p_data1[i].text.strip(), p_data2[i].text.strip(),
                           p_data3[i].text.strip(), p_data4[i].text.strip()])

with open('sport.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["program_info", "program_time", "sport", "program_text"])
    for row in programme_list:
        writer.writerow(row)
Excel Screenshot here
Hi, I am a code newbie and I am trying to get news titles from cnn.com, just like the image of an Excel file attached below.
The problem is that I don't know how to add each column, such as World/Politics/Health, and my code gets data only from the LAST element of the topic list (in this code, 'politics').
So here is my code. Thank you in advance!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import os
from bs4 import BeautifulSoup as soup
from bs4 import NavigableString
import re
import xlsxwriter
from openpyxl import Workbook
path = "C:/Users/Desktop/chromedriver.exe"
driver = webdriver.Chrome(path)
# per section
a =['world','health','politics']
wb = Workbook()
ws = wb.active
for i in a:
    nl = []
    driver.get("https://edition.cnn.com/" + str(i))
    driver.implicitly_wait(3)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    find_ingre = soup.select("span.cd__headline-text")
    for i in find_ingre:
        nl.append(i.get_text())

# make dataframe --> save xlsx
import pandas as pd
from pandas import Series, DataFrame

df = pd.DataFrame(nl)
df.to_excel("cnn_recent_topics.xlsx", index=False)
Result now ---> (screenshot)
Result that I want to get ---> (screenshot)
Could you try this? Comment if you need an explanation:
def custom_scrape(topic):
    nl = []
    driver.get("https://edition.cnn.com/" + str(topic))
    driver.implicitly_wait(3)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    find_ingre = soup.select("span.cd__headline-text")
    for i in find_ingre:
        nl.append(i.get_text())
    return nl

topics = ['world', 'health', 'politics']
result = pd.DataFrame()
for topic in topics:
    temp_df = pd.DataFrame(custom_scrape(topic))  # call the scraper for each topic
    temp_df.columns = [topic]
    # axis=1 adds each topic as its own column; keep the topic names as headers
    result = pd.concat([result, temp_df], axis=1)
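The combined frame can then be written out the same way as in your original code (assuming openpyxl is installed for the .xlsx engine):

result.to_excel("cnn_recent_topics.xlsx", index=False)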
I want to scrape web data using input values from an Excel/CSV file, run the scrape for each row value, and save the output to the same file.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
ciks = pd.read_csv("ciks.csv")
ciks.head()
output:

       CIK
0  1557822
1  1598429
2  1544670
3  1574448
4  1592290
then:
for x in ciks:
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + x + "&owner=exclude&action=getcompany"
    r = request.urlopen(url)
    bytecode = r.read()
    htmlstr = bytecode.decode()
    soup = BeautifulSoup(bytecode)
    t = soup.find('span', {'class': 'companyName'})
    print(t.text)
I got an error:
----> 9 print (t.text)
AttributeError: 'NoneType' object has no attribute 'text'
Here, I want to scrape web data taking each row value as input from the CSV file.
It would be easier to convert the column values to a list and then use it in the for loop; see the solution below:
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd

df = pd.read_csv("ciks.csv")
mylist = df['CIK'].tolist()  # CIK is the column name

company = []
for item in mylist:
    print(item)
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + str(item) + "&owner=exclude&action=getcompany"
    r = request.urlopen(url)
    bytecode = r.read()
    htmlstr = bytecode.decode()
    soup = BeautifulSoup(bytecode, features="lxml")
    t = soup.find('span', {'class': 'companyName'})
    company.append(t.text)
    print(t.text)

# assign returns a new frame, so capture the result
df = df.assign(company=company)
print(df)
df.to_csv("ciks.csv")
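For reference, the original loop failed because iterating over a DataFrame yields its column labels, not its row values, so the request was built with CIK=CIK and no companyName span ever came back. A quick illustration:

import pandas as pd

df = pd.DataFrame({'CIK': [1557822, 1598429]})
for x in df:
    print(x)  # prints 'CIK', the column label, not the row values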
I wrote this code in Python 2.7.13 to scrape a data table from a website.
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + "," + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem,Devizanev,Egyseg,Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
I want to export this into CSV with these 4 columns:
"Penznem Devizanev Egyseg Penznemforintban"
The data are separated with ",", but the last two values are really ONE number written with a decimal comma (283,45), so the separator splits it. How can I fix it?
You cannot avoid that comma in the value directly, but what you can simply do is use another separator, i.e. ; (semicolon).
When you open the file in Excel/Calc, select semicolon (;) as the separator and you will get the result as expected!
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + ";" + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem;Devizanev;Egyseg;Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
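Alternatively, the csv module quotes any field that contains the delimiter, so a value like 283,45 survives even in a comma-separated file. A minimal sketch (Python 3 syntax, row values only for illustration):

import csv

row = ["USD", "amerikai dollar", "1", "283,45"]  # illustrative values; "283,45" holds the decimal comma

with open("proba.csv", "w", newline="", encoding="utf-16") as f:
    writer = csv.writer(f)  # QUOTE_MINIMAL by default: fields containing ',' get quoted
    writer.writerow(["Penznem", "Devizanev", "Egyseg", "Penznemforintban"])
    writer.writerow(row)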
I'm very new at this and I'm having trouble writing a CSV file from a web scrape. Can anyone help? Thank you!
import sys
import urllib2
import csv
import requests
from bs4 import BeautifulSoup

# web scraping wind data
r_wind = requests.get('http://w1.weather.gov/data/obhistory/KCQX.html')
html_wind = r_wind.text

soup = BeautifulSoup(html_wind, "html.parser")
table = soup.find('table')
rows_wind = table.findAll('tr')
rows_wind = rows_wind[1:]  # skip the header row

# writing to a csv file
csvfile_wind = open("wind.csv", "wb")
output_wind = csv.writer(csvfile_wind, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

for row in rows_wind:
    cells = row.findAll('td')  # the cells were never extracted from the row
    Date = cells[0].text.strip()
    Time = cells[1].text.strip()
    Wind = cells[2].text.strip()
    output_wind.writerow([Date, Time, Wind])  # was 'output', which is undefined

csvfile_wind.close()