I'm attempting to scrape data for all the quarterbacks who have been drafted, from http://www.nfl.com/draft/history/fulldraft?type=position.
I'm able to scrape the data, but the output contains blank lines that I cannot get rid of (see the Excel file output screenshot).
Here is the code that I used.
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

playerdata = playerdatasaved = ""
soup = make_soup("http://www.nfl.com/draft/history/fulldraft?type=position")
for record in soup.findAll('tr'):
    playerdata = ""
    for data in record.findAll('td'):
        playerdata = playerdata + "," + data.text
    if len(playerdata) != 0:
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

header = "Round, Selection #, Player, Position, School, Team Drafted" + "\n"
file = open("Quarterbacks.csv", "wb")
file.write(bytes(header, encoding="ascii", errors='ignore'))
file.write(bytes(playerdatasaved, encoding="ascii", errors='ignore'))
I've tried to use an if statement to check for \n breaks and remove the breaks. Also, I've tried to turn the data into a string and use a replace or split command. None of these corrected the issue.
Thanks for any help that you can give me!
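For what it's worth, blank lines like this often come from cell text that contains embedded newlines, or from rows whose cells hold only whitespace, so the len() check still passes. A minimal sketch of one way to guard against both (this is an assumption about the page, not a confirmed fix):

for record in soup.findAll('tr'):
    # get_text(strip=True) removes surrounding whitespace and embedded newlines
    cells = [data.get_text(strip=True) for data in record.findAll('td')]
    if any(cells):  # skip rows whose cells are all empty
        playerdatasaved += "\n" + ",".join(cells)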
I am trying to get the headlines for each day from the Economic Times India archive, from 2020-01-01 to 2020-12-31. This is what I have tried:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from dateutil import rrule
from calendar import monthrange
import csv

def read_url(year, month, starttime):
    url = f'https://economictimes.indiatimes.com/archivelist/year-{year},month-{month},starttime-{starttime}.cms'
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from the website. Response status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def get_starttime(year, month, day):
    date1 = '1990-12-30'
    timestamp1 = time.mktime(datetime.datetime.strptime(date1, '%Y-%m-%d').timetuple())
    date2 = str(year) + '-' + str(month) + '-' + str(day)
    timestamp2 = time.mktime(datetime.datetime.strptime(date2, '%Y-%m-%d').timetuple())
    starttime = (timestamp2 - timestamp1) / 86400
    return str(starttime).replace(".0", "")

headlines_from = '2020-01-01'
headlines_to = '2020-10-31'
headlines_datetime_from = datetime.datetime.strptime(headlines_from, '%Y-%m-%d')
headlines_datetime_to = datetime.datetime.strptime(headlines_to, '%Y-%m-%d')

for dt in rrule.rrule(rrule.MONTHLY, dtstart=headlines_datetime_from, until=headlines_datetime_to):
    year = int(dt.strftime('%Y'))
    month = int(dt.strftime('%m'))
    for day in range(1, monthrange(year, month)[1] + 1):
        starttime = get_starttime(year, month, day)
        data_str_eng = str(year) + '-' + '{:02d}'.format(month) + '-' + '{:02d}'.format(day)
        headlines = []
        soup = read_url(year, month, starttime)
        for td in soup.findAll('td', {'class': 'contentbox5'}):
            for headline in td.findAll('a'):
                if 'archive' not in headline.get('href'):
                    if len(headline.contents) > 0:
                        if headline.contents[0] not in headlines:
                            headlines.append(headlines.contents[0])
        time.sleep(1)
        file = open(f'C:/Users/somar/OneDrive - Technological University of the Shannon Midwest/mythesis/mynew thesis topic/economic_news_headlines_{data_str_eng}.csv', 'w')
        with file:
            write = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONE)
            for item in headlines:
                write.writerow([item,])
The code runs without errors, but I am getting 0 KB data files.
This works for me. The issue is probably with the code that creates the dates for your URL. Perhaps those URLs are sending you to an "empty" page that doesn't contain the HTML you're searching for. Respectfully, it seems like you have spent little time troubleshooting your code, so please do so. If you run into other problems, come back and I'll help you out.
Note: I changed read_url to take a URL, just for the sake of troubleshooting. I also passed two new arguments to the open function, to fix an encoding error and to avoid a blank line after every row in the CSV, respectively.
import csv
import requests
from bs4 import BeautifulSoup

def read_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from the website. Response status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

if __name__ == '__main__':
    url = "https://economictimes.indiatimes.com/archivelist/year-2022,month-12,starttime-44911.cms"
    # get the site's page source, pass it to a bs obj, return the bs obj
    soup = read_url(url)

    headlines = []
    for td in soup.findAll('td', {'class': 'contentbox5'}):
        for headline in td.findAll('a'):
            if 'archive' not in headline.get('href'):
                if len(headline.contents) > 0:
                    if headline.contents[0] not in headlines:
                        headlines.append(headline.contents[0])
    # print(headlines)

    # write to file using utf-8 encoding and without adding a newline every other row
    file = open('PATH/economic_news_headlines.csv', 'w', encoding="utf-8", newline='')
    with file:
        write = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONE)
        for item in headlines:
            write.writerow([item,])
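As a side note on the date arithmetic the answer points to: the working example URL above uses starttime-44911 for a day in December 2022, which is consistent with starttime being a day count from 1899-12-30 (the Excel serial-date epoch), so the 1990-12-30 base date in get_starttime may be the culprit. A minimal sketch of that calculation, under that assumption:

import datetime

def get_starttime(year, month, day):
    # Assumption: starttime is the number of days since 1899-12-30;
    # the question's 1990-12-30 looks like a typo for that epoch.
    epoch = datetime.date(1899, 12, 30)
    return (datetime.date(year, month, day) - epoch).days

print(get_starttime(2022, 12, 16))  # 44911, matching the example URL above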
So I was trying to make a filter that filters out the crap from this scrape, but it only removes the matching words. I would like to filter out the whole line instead of just the words.
from bs4 import BeautifulSoup
import requests
import os

def Scrape():
    page = input("Page: ")
    url = "https://openuserjs.org/?p=" + page
    source = requests.get(url)
    soup = BeautifulSoup(source.text, 'lxml')
    os.system('cls')
    Filter(soup)

def Filter(soup):
    crap = ""
    f = open("Data/Crap.txt", "r")
    for craptext in f:
        crap = craptext
    for Titles in soup.select("a.tr-link-a>b"):
        print(Titles.text.replace(crap, "").strip())

while True:
    Scrape()
Instead of:
print(Titles.text.replace(crap, "").strip())
Try using:
if crap not in Titles.text:
    print(Titles.text.strip())
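Note also that the loop over Data/Crap.txt keeps only the last line, newline included, so the comparison can silently fail. A minimal sketch (an extension of the answer above, not part of it) that loads every term, strips the newlines, and skips a title if it contains any of them, assuming Crap.txt holds one term per line:

from bs4 import BeautifulSoup
import requests

def load_crap(path="Data/Crap.txt"):
    # Strip newlines so the "in" checks actually match; drop empty lines.
    with open(path) as f:
        return {line.strip() for line in f if line.strip()}

def Filter(soup, crap_terms):
    for title in soup.select("a.tr-link-a > b"):
        text = title.text.strip()
        # Skip the whole line if it contains any unwanted term.
        if not any(term in text for term in crap_terms):
            print(text)

page = input("Page: ")
soup = BeautifulSoup(requests.get("https://openuserjs.org/?p=" + page).text, "lxml")
Filter(soup, load_crap())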
I am trying to copy all the data within an HTML page that has the class "chapter_header_styling", using BS4.
This was working when I manually input the URL, but it is tedious when there are multiple books and various chapters. So I created another script that generates all the chapter URLs for the book and combines them into a text file, bchap.txt (book chapters).
Since then I have altered the file and added various breakpoints, so please ignore the lack of comments and the unused arrays/lists. I have narrowed it down to the ### comment where it doesn't work. It's probably not nested right, but I'm not sure. I had this working to a point, but I can't figure out why it won't write the mydivs data into the BOOK.html file. If anyone with more experience could point me in the right direction, it would be much appreciated.
#mkbook.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org/content/FAC2017"
pop = ""
#z = ""
chapters = open("bchap.txt", 'r')
a = []
for aline in chapters:
    chap = aline
    #print (chap)
    #pop = ""
    pop = LINK + chap
    #print (pop)
    r = requests.get(pop)
    data = r.text
    #print(data)
    soup = BeautifulSoup(data, 'html.parser')
    mydivs = soup.findAll("div", {"class": ["annotator", "chapter_header_styling"]})
    f = open("BOOK.html", "a")
    f.write("test <br/>")
    ########################################
    #MY PROBLEM IS BELOW NOT PRINTING DIV DATA INTO TXT FILE
    ########################################
    for div in mydivs:
        print (div)
        z = str(div)
        print(z)  #doesn't printout...why???
        f.write(z)
    print len(mydivs)
    f.close()
chapters.close()
##############################################
## this is the old mkbook.py code before I looped it - inputing url 1 # time
#
# coding: utf-8
from bs4 import BeautifulSoup
import requests

r = requests.get("https://codes.iccsafe.org/content/FAC2017/preface")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
mydivs = soup.findAll("div", {"class": ["annotator",
                                        "chapter_header_styling"]})
f = open("BOOK.html", "a")
for div in mydivs:
    z = str(div)
    f.write(z)
f.close()
print len(mydivs)  #outputs 1 if copied div data.
#######################################
#mkchap.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests

r = requests.get("https://codes.iccsafe.org/content/FAC2017")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
soup.findAll('option', {"value": True})
list = soup.findAll('option')
with open('bchap.txt', 'w') as filehandle:
    for l in list:
        filehandle.write(l['value'])
        filehandle.write("\n")
        print l['value']
#with open('bchap.txt', 'w') as filehandle:
#    filehandle.write("%s\n" % list)
filehandle.close()
The problem seems to be that you are constructing your URL using the wrong base URL.
LINK = "https://codes.iccsafe.org/content/FAC2017"
If you take a look at your first request, you can see this clearly.
print(pop)
print(r.status_code)
Outputs:
https://codes.iccsafe.org/content/FAC2017/content/FAC2017
404
After running the code to populate bchap.txt, its output is
/content/FAC2017
/content/FAC2017/legend
/content/FAC2017/copyright
/content/FAC2017/preface
/content/FAC2017/chapter-1-application-and-administration
/content/FAC2017/chapter-2-scoping-requirements
/content/FAC2017/chapter-3-building-blocks
/content/FAC2017/chapter-4-accessible-routes
/content/FAC2017/chapter-5-general-site-and-building-elements
/content/FAC2017/chapter-6-plumbing-elements-and-facilities
/content/FAC2017/chapter-7-communication-elements-and-features
/content/FAC2017/chapter-8-special-rooms-spaces-and-elements
/content/FAC2017/chapter-9-built-in-elements
/content/FAC2017/chapter-10-recreation-facilities
/content/FAC2017/list-of-figures
/content/FAC2017/fair-housing-accessibility-guidelines-design-guidelines-for-accessible-adaptable-dwellings
/content/FAC2017/advisory
Let's change the base URL first and try again.
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt", 'r')
a = []
for aline in chapters:
    chap = aline
    pop = LINK + chap
    r = requests.get(pop)
    print(pop)
    print(r.status_code)
chapters.close()
Outputs:
https://codes.iccsafe.org/content/FAC2017
404
...
Why? Because of the \n. If we do a
print(repr(pop))
It will output
'https://codes.iccsafe.org/content/FAC2017\n'
You'll have to strip away that \n as well. The final code that worked is:
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt", 'r')
a = []
for aline in chapters:
    chap = aline
    pop = LINK + chap
    r = requests.get(pop.strip())
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    mydivs = soup.findAll("div", class_="annotator chapter_header_styling")
    f = open("BOOK.html", "a")
    for div in mydivs:
        z = str(div)
        f.write(z)
    f.close()
chapters.close()
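A small variation on the same idea (not part of the original answer): open BOOK.html once, outside the loop, so the output file isn't reopened in append mode for every chapter.

from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org"

# Both files are closed automatically when the with-block ends.
with open("bchap.txt") as chapters, open("BOOK.html", "a") as f:
    for aline in chapters:
        r = requests.get(LINK + aline.strip())
        soup = BeautifulSoup(r.text, "html.parser")
        for div in soup.find_all("div", class_="annotator chapter_header_styling"):
            f.write(str(div))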
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"

    #Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if (td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    #Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

#Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually; Python provides a built-in csv module for that.
Then, since you are only after the "Actions" data, I'd locate the "Actions" table and keep the event rows only. This can be done with a filtering function that checks that the first cell is not empty:
import csv

from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")
data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
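If you want the pipe-delimited layout shown in the question rather than commas, csv.writer accepts a delimiter argument; a small sketch (the filename here is just an example built from the game number):

# Same rows as above, but pipe-delimited instead of comma-separated.
with open("{}.txt".format(event_id), "w", newline="") as f:
    writer = csv.writer(f, delimiter="|")
    writer.writerows(data)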
I am a complete programming beginner, so please forgive me if I am not able to express my problem very well. I am trying to write a script that will look through a series of news pages and record the article titles and their links. I have managed to do that for the first page; the problem is getting the content of the subsequent pages. By searching Stack Overflow, I think I found a solution that makes the script access more than one URL, BUT it seems to overwrite the content extracted from each page it accesses, so I always end up with the same number of recorded articles in the file. Something that might help: I know the URLs follow this model: "/ultimas/?page=1", "/ultimas/?page=2", etc., and the site appears to use AJAX to request new articles.
Here is my code:
import csv
import requests
from bs4 import BeautifulSoup as Soup
import urllib

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

for page in range(1, 4):
    url = "%s%d" % (program_url, page)

soup = Soup(urllib.urlopen(url))
letters = soup.find_all("div", class_="titulo-noticia")
letters[0]

lobbying = {}
for element in letters:
    lobbying[element.a.get_text()] = {}

letters[0].a["href"]

prefix = "http://agenciabrasil.ebc.com.br"
for element in letters:
    lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

import os, csv
os.chdir("...")

with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

import json
with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)

print "Fim"
Any help on how I might go about adding the content of each page to the final file would be very appreciated. Thank you!
How about this one, if it serves the same purpose:
import csv, requests
from lxml import html

base_url = "http://agenciabrasil.ebc.com.br"
program_url = base_url + "/ultimas/?page={0}"

outfile = open('scraped_data.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Caption", "Link"])

for url in [program_url.format(page) for page in range(1, 4)]:
    response = requests.get(url)
    tree = html.fromstring(response.text)
    for title in tree.xpath("//div[@class='noticia']"):
        caption = title.xpath('.//span[@class="field-content"]/a/text()')[0]
        policy = title.xpath('.//span[@class="field-content"]/a/@href')[0]
        writer.writerow([caption, base_url + policy])
It looks like the code in your for loop (for page in range(1, 4):) isn't being called, because your file isn't correctly indented:
If you tidy up your code, it works:
import csv, requests, os, json, urllib
from bs4 import BeautifulSoup as Soup

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))
    letters = soup.find_all("div", class_="titulo-noticia")

    lobbying = {}
    for element in letters:
        lobbying[element.a.get_text()] = {}

    prefix = "http://agenciabrasil.ebc.com.br"
    for element in letters:
        lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

    for item in lobbying.keys():
        print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

    #os.chdir("...")

    with open("lobbying.csv", "w") as toWrite:
        writer = csv.writer(toWrite, delimiter=",")
        writer.writerow(["name", "link",])
        for a in lobbying.keys():
            writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

    with open("lobbying.json", "w") as writeJSON:
        json.dump(lobbying, writeJSON)

print "Fim"