Scraping data > stop at white line - python

from bs4 import BeautifulSoup
import urllib
import json
import os

jaren = ["2012", "2010", "2006", "2003", "2002", "1998", "1994", "1989", "1986", "1982", "1981", "1977", "1972", "1971", "1967", "1963", "1959", "1956"]
DESIRED_COLUMNS = {1, 2, 5}  # scrape only afk, aantal & zetels

verkiezingsData = []
filename = raw_input('Enter a filename: ') or 'data.json'

# open file and open json array
with open(filename, "w") as file:
    file.write("[{")

for Jaargetal in jaren:
    # url source
    r = urllib.urlopen("http://www.nlverkiezingen.com/TK" + Jaargetal + ".html").read()
    soup = BeautifulSoup(r, "html.parser")
    tables = soup.find_all("table")

    for table in tables:
        header = soup.find_all("h1")[0].getText()
        # print header
        with open(filename, "a+") as file:
            file.write("\"%s\": [" % header)  # header as beginning of json
        trs = table.find_all("tr")[0].getText()
        del verkiezingsData[:]  # clear list before adding new data

        # add the 3 columns to a list
        for tr in table.find_all("tr")[:22]:  # 22 rows from top to bottom
            for index, val in enumerate(tr.find_all('td')):
                if index in DESIRED_COLUMNS:  # refers to DESIRED_COLUMNS defined above
                    verkiezingsData.append(val.getText().strip())

        # json objects from the 3 values
        for a, b, c in zip(verkiezingsData[::3], verkiezingsData[1::3], verkiezingsData[2::3]):  # the desired columns 1, 2, 5
            data2 = {'afk': a, "aantal": b, "zetels": c}
            # file writing
            with open(filename, 'a') as outfile:
                json.dump(data2, outfile)
                outfile.write(",")

        # open file, delete last comma and close array
        with open(filename, 'ab+') as file:
            file.seek(-1, os.SEEK_END)
            file.truncate()
            file.write("],")

# open file, delete last comma, and close array
with open(filename, 'r+b') as file:
    file.seek(-1, os.SEEK_END)
    file.truncate()
    file.write("}]")

# open file and pretty print json data
with open(filename, 'r') as file:
    prettydata = json.load(file)
with open(filename, 'w') as file:
    json.dump(prettydata, file, sort_keys=True, indent=4, separators=(',', ': '))
I made a scraper which scrapes data from nlverkiezingen.com. When it saves to a JSON file it looks like this:
"Tweede-Kamerverkiezingen - 12 september 2012": [
{
"aantal": "Aantal",
"afk": "Afk.",
"zetels": "Zetels"
},
{
"aantal": "2504948",
"afk": "VVD",
"zetels": "41"
},
The first row is: Aantal/Afk/Zetels. I don't want this to be scraped. How can I change this so that the scraping begins at the second row?
The second thing is that the last row is different everywhere: sometimes it is the 20th row, sometimes the 15th. How can I change this so that the scraping stops as soon as it reaches a white/empty row?

I do not know if I got it right, it is just a guess, but maybe something like
for tr in table.find_all("tr")[1:22]:
to skip the first row?

The first row is: Aantal/Afk/Zetels. I don't want this to be scraped.
Replace
for tr in table.find_all("tr")[:22]:
with
for tr in table.find_all("tr")[1:22]:
Python has zero-based indexing, so 1 refers to the second row in your table.
How can I change this? That the scraping ends when he sees a white/empty row when it is scraping?
The &nbsp; entities in the empty table cells will be parsed by BeautifulSoup as the u"\xa0" Python string. Check the content of the first tag on each row, compare it to that value, and use that to break out of your loop.
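For example, a minimal sketch of that check, reusing the names from the question's code (the break condition is the only addition):

for tr in table.find_all("tr")[1:22]:
    tds = tr.find_all('td')
    # a blank separator row has no real content; BeautifulSoup parses &nbsp; as u"\xa0"
    if not tds or tds[0].getText().strip(u"\xa0 \n") == "":
        break
    for index, val in enumerate(tds):
        if index in DESIRED_COLUMNS:
            verkiezingsData.append(val.getText().strip())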

Related

Webscraping issue where dataframe to csv put output into one cell

I am trying to help out our soccer coach, who is doing some work on helping underprivileged kids get recruited. I am trying to scrape a "topdrawer" website page so we can track where players get placed. I am not a Python expert at all and am banging my head against the wall. I got some help yesterday and tried to implement it - see the two sets of code below. Neither puts the data into a nice table we can sort and analyze. Thanks in advance for any help.
import bs4 as bs
import urllib.request
import pandas as pd
import csv

max_page_num = 14
max_page_dig = 1  # number of digits in the page number

with open('result.csv', "w", newline='') as f:
    f.write("Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment \n")

for i in range(0, max_page_num):
    page_num = (max_page_dig - len(str(i))) * "0" + str(i)  # gives a string in the format of 1, 01 or 001, 005 etc
    source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
    df = pd.read_html(source)
    df = pd.DataFrame(df)
    df.to_csv('results.csv', header=False, index=False, mode='a')  # 'a' should append each table to the csv file, instead of overwriting it
The second method jumbles the output up into one line with \n separators etc.
import bs4 as bs
import urllib.request
import pandas as pd
import csv

max_page_num = 14
max_page_dig = 1  # number of digits in the page number

with open('result.csv', "w", newline='') as f:
    f.write("Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment \n")

for i in range(0, max_page_num):
    page_num = (max_page_dig - len(str(i))) * "0" + str(i)  # gives a string in the format of 1, 01 or 001, 005 etc
    print(page_num)
    source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
    print(source)
    url = urllib.request.urlopen(source).read()
    soup = bs.BeautifulSoup(url, 'lxml')
    table = soup.find('table')
    #table = soup.table
    table_rows = table.find_all('tr')
    with open('result.csv', 'a', newline='') as f:
        for tr in table_rows:
            td = tr.find_all('td')
            row = [i.text for i in td]
            f.write(str(row))
In the first version the data is all placed on one line and not separated. The second version puts each page into one cell and splits the pages in half.
A page may have many <table> elements in its HTML (tables are sometimes used to create menus or to organize elements on the page). pandas.read_html() creates a DataFrame for every <table> on the page and always returns a list of all created DataFrames (even if there was only one <table>), so you have to check which one has your data. You can display every DataFrame from the list to see which one you need. This way I know that the first DataFrame has your data, and you have to use [0] to get it.
import pandas as pd

max_page_num = 15  # it has to be 15 instead of 14 because `range(15)` will give `0-14`

with open('result.csv', 'w', newline='') as f:
    f.write('Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment\n')

for i in range(max_page_num):
    print('page:', i)
    page_num = str(i)
    source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
    all_tables = pd.read_html(source)
    df = all_tables[0]
    print('items:', len(df))
    df.to_csv('result.csv', header=False, index=False, mode='a')  # 'a' appends each table to the same csv file instead of overwriting it
EDIT:
In the second version you should use strip() to remove \n, which csv would treat as the beginning of a new row.
You shouldn't use str(row) because it creates a string with [ ], which is not correct in a csv file. You should rather use ",".join(row) to create the string, and you have to add \n at the end of every row because write() doesn't add it.
But it would be better to use the csv module and its writerow() for this. It converts the list to a string with , as the separator and adds \n automatically. If some item contains , or \n, it wraps it in " " to create a correct row.
import bs4 as bs
import urllib.request
import csv

max_page_num = 15

fh = open('result.csv', "w", newline='')
csv_writer = csv.writer(fh)
csv_writer.writerow(["Name", "Gender", "State", "Position", "Grad", "Club/HS", "Rating", "Commitment"])

for i in range(max_page_num):
    print('page:', i)
    page_num = str(i)
    source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
    url = urllib.request.urlopen(source).read()
    soup = bs.BeautifulSoup(url, 'lxml')
    table = soup.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        #row = [i.text.strip() for i in td]  # strip to remove spaces and '\n'
        row = [i.get_text(strip=True) for i in td]  # strip to remove spaces and '\n'
        if row:  # check if row is not empty
            #print(row)
            csv_writer.writerow(row)

fh.close()

Combine two python scripts for web search

I'm trying to download files from a site and, due to search result limitations (max 300), I need to search for each item individually. I have a csv file with the complete list, and I've written some basic code to return the ID# column.
With some help, I've got another script that iterates through each search result and downloads a file. What I need to do now is combine the two so that it searches for each individual ID# and downloads the file.
I know my loop is messed up here, I just can't figure out where, or whether I'm even looping in the right order.
import requests, json, csv

faciltiyList = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        faciltiyList.append(searchterm[0])

url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
r = requests.get(url+"?term="+str(searchterm))
searchresults = json.loads(r.content.decode('utf-8'))
for report in searchresults:
    rpt_id = report['RPT_ID']
    reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r = requests.get(reporturl)
    a = r.headers['Content-Disposition']
    filename = a[a.find("filename=")+9:len(a)]
    file = open(filename, "wb")
    file.write(r.content)
    r.close()
The original code I have is here:
import requests, json
searchterm="ALAMEDA (COUNTY)"
url="https://siera.oshpd.ca.gov/FindFacility.aspx"
r=requests.get(url+"?term="+searchterm)
searchresults=json.loads(r.content.decode('utf-8'))
for report in searchresults:
rpt_id=report['RPT_ID']
reporturl=f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
r=requests.get(reporturl)
a=r.headers['Content-Disposition']
filename=a[a.find("filename=")+9:len(a)]
file = open(filename, "wb")
file.write(r.content)
r.close()
The searchterm = "ALAMEDA (COUNTY)" returns more than 300 results, so I'm trying to replace "ALAMEDA (COUNTY)" with a list that runs through each name (ID# in this case) so that I get just one result, then runs again for the next item on the list.
CSV - just 1 line
Tested with a CSV file with just 1 line:
406014324,"HOLISTIC PALLIATIVE CARE, INC.",550004188,Parent Facility,5707 REDWOOD RD,OAKLAND,94619,1,ALAMEDA,Not Applicable,,Open,1/1/2018,Home Health Agency/Hospice,Hospice,37.79996,-122.17075
Python code
This script reads the IDs from the CSV file. Then it fetches the results from the URL and finally writes the desired contents to disk.
import requests, json, csv

# read IDs from csv
facilityIds = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        facilityIds.append(searchterm[0])

# fetch and write file contents
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
for facilityId in facilityIds:
    r = requests.get(url+"?term="+str(facilityId))
    reports = json.loads(r.content.decode('utf-8'))
    # print(f"reports = {reports}")
    for report in reports:
        rpt_id = report['RPT_ID']
        reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
        r = requests.get(reporturl)
        a = r.headers['Content-Disposition']
        filename = a[a.find("filename=")+9:len(a)]
        # print(f"filename = {filename}")
        with open(filename, "wb") as o:
            o.write(r.content)

Having problems writing to text files. Text files being overwritten/cut

I would like to write new data to the beginning of my text file, with the previous data shifting down one line each time new data is imported. I would like everything to stay organized, but every time I import, something gets deleted.
Code:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

response = requests.get('https://www.lotteryusa.com/michigan/lucky-4-life/')
soup = BeautifulSoup(response.text, 'html.parser')
date = soup.find(class_='date')
results = soup.find(class_='draw-result list-unstyled list-inline')

d = datetime.strptime(date.time['datetime'], '%Y-%m-%d')
Lucky = (d.strftime("%m%d%Y") + (',') + results.get_text()[:-20].strip().replace('\n', ','))
print(Lucky)

with open("webscraper2noteppad++", "r+") as f:
    file = f.readlines()
    f.seek(0,0)
    f.write(Lucky)
I also tried doing this:

with open("webscraper2noteppad++", "r+") as f:
    file = f.read()
    f.seek(0,0)
    f.write(Lucky + '\n')
but I have to put 10 lines between the already existing data and the new data so that it can be imported on top without deleting anything.
You can first read the content of your file, then append it to the new data, and finally write everything back to the file:
with open("webscraper2noteppad++", "r") as f:
data = f.read()
with open("webscraper2noteppad++", "w") as f:
f.write('{}{}{}'.format(lucky, '\n' if data else '', data))

Write data into csv

I am crawling data from Wikipedia and it works so far. I can display it in the terminal, but I can't write it the way I need it into a csv file :-/
The code is pretty long, but I'll paste it here anyway and hope that somebody can help me.
import csv
import requests
from bs4 import BeautifulSoup


def spider():
    url = 'https://de.wikipedia.org/wiki/Liste_der_Gro%C3%9F-_und_Mittelst%C3%A4dte_in_Deutschland'
    code = requests.get(url).text  # read source code and make unicode
    soup = BeautifulSoup(code, "lxml")  # create BS object

    table = soup.find(text="Rang").find_parent("table")
    for row in table.find_all("tr")[1:]:
        partial_url = row.find_all('a')[0].attrs['href']
        full_url = "https://de.wikipedia.org" + partial_url
        get_single_item_data(full_url)  # goes into the individual sites


def get_single_item_data(item_url):
    page = requests.get(item_url).text  # read source code & format with .text to unicode
    soup = BeautifulSoup(page, "lxml")  # create BS object

    def getInfoBoxBasisDaten(s):
        return str(s) == 'Basisdaten' and s.parent.name == 'th'

    basisdaten = soup.find_all(string=getInfoBoxBasisDaten)[0]

    basisdaten_list = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
                       'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
                       'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
                       'Oberbürgermeister', 'Oberbürgermeisterin']

    with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
                      'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
                      'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
                      'Oberbürgermeister', 'Oberbürgermeisterin']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
        writer.writeheader()

        for i in basisdaten_list:
            wanted = i
            current = basisdaten.parent.parent.nextSibling
            while True:
                if not current.name:
                    current = current.nextSibling
                    continue
                if wanted in current.text:
                    items = current.findAll('td')
                    print(BeautifulSoup.get_text(items[0]))
                    print(BeautifulSoup.get_text(items[1]))
                    writer.writerow({i: BeautifulSoup.get_text(items[1])})
                if '<th ' in str(current): break
                current = current.nextSibling


print(spider())
The output is incorrect in two ways: the cells are not in their right places, and only one city is written; all the other cities are missing. Instead, there should be one row per city, with every value in its proper column.
'... only one city is written ...': You call get_single_item_data for each city, and inside this function you open the output file with the same name in the statement with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:, which overwrites the output file each time the function is called.
Each variable is written to a new row: in the statement writer.writerow({i: BeautifulSoup.get_text(items[1])}) you write the value of a single variable as its own row. What you need to do instead is create a dictionary for the values before you start looking through the page. As you find values on the page, you put them into the dictionary by field name. Then, after you have collected all of the values that are available, you call writer.writerow once.
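One possible restructuring along those lines (a sketch, not the original poster's exact code): the csv writer is created once in spider() and passed into get_single_item_data, and each city's values are collected into a dictionary before a single writerow call. The infobox-walking loop and the field-name list are reused unchanged from the question.

import csv
import requests
from bs4 import BeautifulSoup

FIELDNAMES = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
              'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
              'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
              'Oberbürgermeister', 'Oberbürgermeisterin']

def spider():
    url = 'https://de.wikipedia.org/wiki/Liste_der_Gro%C3%9F-_und_Mittelst%C3%A4dte_in_Deutschland'
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    table = soup.find(text="Rang").find_parent("table")

    # open the output file (and write the header) exactly once
    with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
        writer.writeheader()
        for row in table.find_all("tr")[1:]:
            full_url = "https://de.wikipedia.org" + row.find_all('a')[0].attrs['href']
            get_single_item_data(full_url, writer)

def get_single_item_data(item_url, writer):
    soup = BeautifulSoup(requests.get(item_url).text, "lxml")

    def getInfoBoxBasisDaten(s):
        return str(s) == 'Basisdaten' and s.parent.name == 'th'

    basisdaten = soup.find_all(string=getInfoBoxBasisDaten)[0]

    city_row = {}  # collect all values for this city first
    for wanted in FIELDNAMES:
        current = basisdaten.parent.parent.nextSibling
        while True:
            if not current.name:
                current = current.nextSibling
                continue
            if wanted in current.text:
                items = current.findAll('td')
                city_row[wanted] = BeautifulSoup.get_text(items[1])
            if '<th ' in str(current):
                break
            current = current.nextSibling

    writer.writerow(city_row)  # one row per city, written once

spider()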

Scraping needs to stop when it sees a white row

from bs4 import BeautifulSoup
import urllib
import json
import os

jaren = ["2012", "2010", "2006", "2003", "2002", "1998", "1994", "1989", "1986", "1982", "1981", "1977", "1972", "1971", "1967", "1963", "1959", "1956"]
DESIRED_COLUMNS = {1, 2, 5}  # scrape only afk, aantal & zetels

verkiezingsData = []
filename = raw_input('Enter a filename: ') or 'data.json'

# open file and open json array
with open(filename, "w") as file:
    file.write("[{")

for Jaargetal in jaren:
    # url source
    r = urllib.urlopen("http://www.nlverkiezingen.com/TK" + Jaargetal + ".html").read()
    soup = BeautifulSoup(r, "html.parser")
    tables = soup.find_all("table")

    for table in tables:
        header = soup.find_all("h1")[0].getText()
        # print header
        with open(filename, "a+") as file:
            file.write("\"%s\": [" % header)  # header as beginning of json
        trs = table.find_all("tr")[0].getText()
        del verkiezingsData[:]  # clear list before adding new data

        # add the 3 columns to a list
        for tr in table.find_all("tr")[1:22]:  # 22 rows from top to bottom
            for index, val in enumerate(tr.find_all('td')):
                if index in DESIRED_COLUMNS:  # refers to DESIRED_COLUMNS defined above
                    verkiezingsData.append(val.getText().strip())

        # json objects from the 3 values
        for a, b, c in zip(verkiezingsData[::3], verkiezingsData[1::3], verkiezingsData[2::3]):  # the desired columns 1, 2, 5
            data2 = {'afk': a, "aantal": b, "zetels": c}
            # file writing
            with open(filename, 'a') as outfile:
                json.dump(data2, outfile)
                outfile.write(",")

        # open file, delete last comma and close array
        with open(filename, 'ab+') as file:
            file.seek(-1, os.SEEK_END)
            file.truncate()
            file.write("],")

# open file, delete last comma, and close array
with open(filename, 'r+b') as file:
    file.seek(-1, os.SEEK_END)
    file.truncate()
    file.write("}]")

# open file and pretty print json data
with open(filename, 'r') as file:
    prettydata = json.load(file)
with open(filename, 'w') as file:
    json.dump(prettydata, file, sort_keys=True, indent=4, separators=(',', ': '))
I made a scraper which scrapes data from nlverkiezingen.com. It scrapes Aantal/Afk/Zetels.
It has a list of years that it scrapes:
jaren = ["2012", "2010", "2006", "2003", "2002", "1998", "1994", "1989"]
It needs to begin at row 1:
for tr in table.find_all("tr")[1:22]:  # 22 rows from top to bottom
But it needs to end when it sees a white/empty line instead of always at row 22 (every year ends on a different row). How can I code this?
Or is it possible to say somewhere in the code, for every year in the list, when it needs to stop scraping rows? For example: 2010 at row 22, 1959 at row 10.
I will assume here that you are asking how to break out of the for loop when the tr element is empty.
for tr in table.find_all("tr")[1:22]:
    if tr.getText() == "":
        break
If you need to stop the iteration when a specific column like 'aantal' is empty, then maybe try something like this:
# add the 3 columns to a list
for tr in table.find_all("tr")[1:22]:  # 22 rows from top to bottom
    td_cols = tr.find_all('td')
    if td_cols[2].getText() == '':  # index 2 is aantal; if it is empty, break the loop
        break
    # else go on as usual
    for index, val in enumerate(td_cols):
        pass  # do stuff
However, I believe you should try to structure your code differently so that you have more control over the flow.
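As an illustration of that last point, here is one way such a restructuring could look (a sketch, keeping the question's Python 2 calls urllib.urlopen and raw_input, not the poster's actual code): collect everything in a plain dict keyed by the page header and write it with a single json.dump call at the end, which removes the need for the seek/truncate comma-trimming. The blank-row check relies on empty cells being parsed as u"\xa0", as noted in the earlier answer.

from bs4 import BeautifulSoup
import urllib
import json

jaren = ["2012", "2010", "2006", "2003", "2002", "1998", "1994", "1989"]
DESIRED_COLUMNS = {1, 2, 5}  # afk, aantal & zetels

filename = raw_input('Enter a filename: ') or 'data.json'
alle_data = {}  # header -> list of row dicts

for jaar in jaren:
    html = urllib.urlopen("http://www.nlverkiezingen.com/TK" + jaar + ".html").read()
    soup = BeautifulSoup(html, "html.parser")
    header = soup.find_all("h1")[0].getText()

    rows = []
    for table in soup.find_all("table"):
        for tr in table.find_all("tr")[1:]:  # skip the header row
            tds = tr.find_all("td")
            # stop at the first blank row; empty cells are parsed as u"\xa0"
            if not tds or tds[0].getText().strip(u"\xa0 \n") == "":
                break
            values = [td.getText().strip() for i, td in enumerate(tds) if i in DESIRED_COLUMNS]
            if len(values) == 3:
                afk, aantal, zetels = values
                rows.append({"afk": afk, "aantal": aantal, "zetels": zetels})
    alle_data[header] = rows

# one write at the end; json.dump produces valid JSON, so no comma-trimming is needed
with open(filename, "w") as f:
    json.dump([alle_data], f, sort_keys=True, indent=4, separators=(',', ': '))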
