Webscraping with mechanize and beautifulsoup - cannot write output file - python

I need to get lots of data specific to rivercruising, so I am working with alteryx, and for scraping I want to use python from the command line. I need to write the output file to json or to csv. The output file is empty. The hashtags in the code are for processing the output file in alteryx, as the scraped text already contains ",". Preferably I would love to map the output to Json. My code is as follows:
from mechanize import Browser
from bs4 import BeautifulSoup
import lxml
mech = Browser()
url = 'http://www.cruiseshipschedule.com/viking-river-cruises/viking-aegir-schedule/'
page = mech.open(url)
html = page.read()
html.replace('charset="ISO-8859-1"','charset=utf-8')
s = BeautifulSoup(html, "lxml")
content = s.findAll('div', id="content")
link = s.findAll("a")
h1 = s.findAll("h1")
table = s.findAll("table", border="1")
for link in s.findAll("a"):
linktext = link.text
linkhref = link.get("href")
for h1 in s.findAll("h1"):
ship = h1.text
h2_1 = s.h2
h2_1.text
h2_2 = h2_1.find_next('h2')
itinerary_1 = h2_2.text
h2_3 = h2_2.find_next('h2')
itinerary_2 = h2_3.text
h2_4 = h2_3.find_next('h2')
itinerary_3 = h2_4.text
for table in content:
table0 = s.findAll("table", border='0')
for tr in s.findAll("table", border='1'):
trs1 = s.findAll("tr")
table1 = tr.text.replace('\n','|')
tds1 = s.findAll('td')
uls1 = s.findAll('ul')
lis1 = s.findAll('li')
for tr in s.findAll("table", border='0'):
trs2 = s.findAll("tr")
table2 = tr.text.replace('\n','|')
tds2 = s.findAll('td')
uls2 = s.findAll('ul')
lis2 = s.findAll('li')
all_data=ship+"#"+table1+"#"+table2+"#"+itinerary_1+"#"+itinerary_2+"#"+itinerary_3
all_data = open("Z:/txt files/all_data.txt", "w")
print all_data >> "Z:/txt files/all_data.txt"

To get output to your file, try something like instead of the last 2 lines in your code above:
with open('all_data_txt, 'w') as f:
f.write(all_data.encode('utf8'))

Related

CSV -(excel)- Python. Seems like wrong writing on csv from python

I´m trying to export some data from a website and I first tried on one single page. I´ve to import text delimited by titles:
['Drug name','General Information','Clinical Results','Side Effects','Mechanism of Action','Literature
References','Additional Information','Approval Date','Date Created','Company Name']
The url is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus
The code currently works, it gives me all the data. But when I insert it on the CSV , the information is not delimited as I wish.
As it is one single page, the excel should have ONE row... but it doesn´t
The code:
from bs4 import BeautifulSoup
import requests
import csv
csv_file = open('Drugs.csv','w')
csv_writer = csv.writer(csv_file, delimiter ='+')
csv_writer.writerow(['Drug name','General Information','Clinical Results','Side Effects','Mechanism of Action','Literature References','Additional Information','Approval Date','Date Created','Company Name'])
link = requests.get('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus')
aux =[]
soup = BeautifulSoup(link.content, 'lxml')
drugName = soup.find('div', class_='company-navigation').find('h1').text
gralInfo = soup.find('div', class_='body directory-listing-profile__description')
y = 0
for h2 in gralInfo.find_all('h2'):
print (y)
text =''
for sibling in h2.find_next_siblings():
if (sibling.name == 'h2'):
break
else:
text = text + sibling.get_text(separator ='\n') + '\n'
print(text)
aux.append(text)
print()
print()
y = y + 1
auxi = []
for info in soup.find_all('div', class_='contact directory-listing-profile__master-detail'):
print(info.text)
auxi.append(info.text)
csv_writer.writerow([drugName, aux[0], aux[1], aux[2], aux[3], aux[4], aux[5], auxi[0], auxi[1], auxi[2]])

How to loop through BS4 data and print div tag correctly

I am trying to copy all the data within an HTML page that has the certain class "chapter_header_styling" with BS4.
This was working when I manually inputed the URL – but is tedious when there are multiple books and various chapters. So I then created another script that would generate all the chapter URLs for the book and combine them into a text file bchap.txt (book chapters).
Since then I have altered the file and added various break points so ignore my lack of comments and unused arrays/list. I have narrowed it down to the ###Comment## where it doesn't work. It's probably not nested right but I'm not sure... I had this working to a point but can't figure out why it won't paste the mydivs data into the book.html file. If anyone with more experience could point me in the right direction much would be appreciated.
#mkbook.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org/content/FAC2017"
pop = ""
#z = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
#print (chap)
#pop = ""
pop = LINK+chap
#print (pop)
r = requests.get(pop)
data = r.text
#print(data)
soup = BeautifulSoup(data, 'html.parser')
mydivs = soup.findAll("div", {"class": ["annotator", "chapter_header_styling"]})
f = open("BOOK.html","a")
f.write("test <br/>")
########################################
#MY PROBLEM IS BELOW NOT PRINTING DIV DATA INTO TXT FILE
########################################
for div in mydivs:
print (div)
z = str(div)
print(z) #doesn't printout...why???
f.write(z)
print len(mydivs)
f.close()
chapters.close()
##############################################
## this is the old mkbook.py code before I looped it - inputing url 1 # time
#
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017/preface")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
mydivs = soup.findAll("div",{"class":["annotator",
"chapter_header_styling"]})
f = open("BOOK.html","a")
for div in mydivs:
z = str(div)
f.write(z)
f.close()
print len(mydivs) #outputs 1 if copied div data.
#######################################
#mkchap.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
soup.findAll('option',{"value":True})
list = soup.findAll('option')
with open('bchap.txt', 'w') as filehandle:
for l in list:
filehandle.write(l['value'])
filehandle.write("\n")
print l['value']
#with open('bchap.txt', 'w') as filehandle:
# filehandle.write("%s\n" % list)
filehandle.close()
The problem seems to be that you are constructing your url using a wrong base url.
LINK = "https://codes.iccsafe.org/content/FAC2017"
If you take a look at your 1st request you can see this clearly.
print(pop)
print(r.status_code)
Outputs:
https://codes.iccsafe.org/content/FAC2017/content/FAC2017
404
After running the code to populate bchap.txt, its output is
/content/FAC2017
/content/FAC2017/legend
/content/FAC2017/copyright
/content/FAC2017/preface
/content/FAC2017/chapter-1-application-and-administration
/content/FAC2017/chapter-2-scoping-requirements
/content/FAC2017/chapter-3-building-blocks
/content/FAC2017/chapter-4-accessible-routes
/content/FAC2017/chapter-5-general-site-and-building-elements
/content/FAC2017/chapter-6-plumbing-elements-and-facilities
/content/FAC2017/chapter-7-communication-elements-and-features
/content/FAC2017/chapter-8-special-rooms-spaces-and-elements
/content/FAC2017/chapter-9-built-in-elements
/content/FAC2017/chapter-10-recreation-facilities
/content/FAC2017/list-of-figures
/content/FAC2017/fair-housing-accessibility-guidelines-design-guidelines-for-accessible-adaptable-dwellings
/content/FAC2017/advisory
Lets change the base url first and try again.
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
pop = LINK+chap
r = requests.get(pop)
print(pop)
print(r.status_code)
chapters.close()
Outputs:
https://codes.iccsafe.org/content/FAC2017
404
...
why? b'coz of the \n. If we do a
print(repr(pop))
It will output
'https://codes.iccsafe.org/content/FAC2017\n'
You'll have to strip away that \n also. The final code that worked is
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
pop = LINK+chap
r = requests.get(pop.strip())
data = r.text
soup = BeautifulSoup(data, 'html.parser')
mydivs = soup.findAll("div", class_="annotator chapter_header_styling")
f = open("BOOK.html","a")
for div in mydivs:
z = str(div)
f.write(z)
f.close()
chapters.close()

BeautifulSoup Scraping Formatting

This is my first time using BeautifulSoup and I am attempting to scrap store location data from a local convenience store.
However I'm running into some issues on trying to remove empty lines when data is being passed into a CSV file, I've tried .replace('\n','') and .strip() both did not worked.
Also I'm having problems with splitting data that is scraped and contained in the same sibling method.
I've added the script below:
from bs4 import BeautifulSoup
from requests import get
import urllib.request
import sched, time
import csv
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#print (soup.prettify())
#open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')
#create the csv writer object
csvwriter = csv.writer(location_data)
cheers = soup.find('div' , id="store_container")
count = 0
#Loop for Header tags
for paragraph in cheers.find_all('b'):
header1 = paragraph.text.replace(':' , '')
header2 = paragraph.find_next('b').text.replace(':' , '')
header3 = paragraph.find_next_siblings('b')[1].text.replace(':' , '')
if count == 0:
csvwriter.writerow([header1, header2, header3])
count += 1
break
for paragraph in cheers.find_all('br'):
brnext = paragraph.next_sibling.strip()
brnext1 = paragraph.next_sibling
test1 = brnext1.next_sibling.next_sibling
print(test1)
csvwriter.writerow([brnext, test1])
location_data.close()
Sample of output generated:
Sample of what output should look like:
How can I achieve this?
Thanks in advance.
To make it slightly organized, you can try like the following. I've used .select() instead of .find_all().
import csv
from bs4 import BeautifulSoup
import requests
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
with open("output.csv","w",newline="") as infile:
writer = csv.writer(infile)
writer.writerow(["Address","Telephone","Store hours"])
for items in soup.select("#store_container .store_col"):
addr = items.select_one("b").next_sibling.next_sibling
tel = items.select_one("b:nth-of-type(2)").next_sibling
store = items.select_one("b:nth-of-type(3)").next_sibling
writer.writerow([addr,tel,store])
You just need to change the way of extracting address, tel and store hours
import csv
from bs4 import BeautifulSoup
from requests import get
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print (soup.prettify())
# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')
# create the csv writer object
csvwriter = csv.writer(location_data)
cheers = soup.find('div', id="store_container")
count = 0
# Loop for Header tags
for paragraph in cheers.find_all('b'):
header1 = paragraph.text.replace(':', '')
header2 = paragraph.find_next('b').text.replace(':', '')
header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
if count == 0:
csvwriter.writerow([header1, header2, header3])
count += 1
break
for paragraph in cheers.find_all('div'):
label = paragraph.find_all('b')
if len(label) == 3:
print(label)
address = label[0].next_sibling.next_sibling
tel = label[1].next_sibling
hours = label[2].next_sibling
csvwriter.writerow([address, tel, hours])
location_data.close()

Creating a text-delimited file from HTML tables using BeautifulSoup

I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request
def createtext():
gamestr = urlstr + "|"
#Find all table lines. Create one pipe-delimited line for each.
aptext = gamestr
for el in soup.find_all('tr'):
playrow = el.find_all('td', 'tdOdd')
for td in playrow:
if(td.find(text=True)) not in ("", None, "\n"):
aptext = aptext + ''.join(td.text) + "|"
aptext = aptext + "\n" + gamestr
#Creates file with Game # as filename and writes the data to the file
currentfile = urlstr + ".txt"
with open(currentfile, "w") as f:
f.write(str(aptext))
#Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually, Python provides a built-in csv module for that.
Then, since you are up to "actions" only, I'd identify the "actions" table and find the events-only rows. This can be done with the help of a filtering function checking the first cell to not be empty:
import csv
from bs4 import BeautifulSoup
import requests
def only_action_rows(tag):
if tag.name == 'tr':
first_cell = tag.find('td', class_='tdOdd')
return first_cell and first_cell.get_text(strip=True)
event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
actions_table = soup.find("h2", text="Actions").find_parent("table")
data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
for row in actions_table.find_all(only_action_rows)]
with open("output.csv", "w") as f:
writer = csv.writer(f)
writer.writerows(data)
Note that I'm using requests here.

Parse HTML table data to JSON and save to text file in Python 2.7

I'm trying to extract the data on the crime rate across states from
this webpage, link to web page
http://www.disastercenter.com/crime/uscrime.htm
I am able to get this into text file. But I would like to get the
response in Json format. How can I do this in python.
Here is my code:
import urllib
import re
from bs4 import BeautifulSoup
link = "http://www.disastercenter.com/crime/uscrime.htm"
f = urllib.urlopen(link)
myfile = f.read()
soup = BeautifulSoup(myfile)
soup1=soup.find('table', width="100%")
soup3=str(soup1)
result = re.sub("<.*?>", "", soup3)
print(result)
output=open("output.txt","w")
output.write(result)
output.close()
The following code will get the data from the two tables and output all of it as a json formatted string.
Working Example (Python 2.7.9):
from lxml import html
import requests
import re as regular_expression
import json
page = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = html.fromstring(page.text)
tables = [tree.xpath('//table/tbody/tr[2]/td/center/center/font/table/tbody'),
tree.xpath('//table/tbody/tr[5]/td/center/center/font/table/tbody')]
tabs = []
for table in tables:
tab = []
for row in table:
for col in row:
var = col.text_content()
var = var.strip().replace(" ", "")
var = var.split('\n')
if regular_expression.match('^\d{4}$', var[0].strip()):
tab_row = {}
tab_row["Year"] = var[0].strip()
tab_row["Population"] = var[1].strip()
tab_row["Total"] = var[2].strip()
tab_row["Violent"] = var[3].strip()
tab_row["Property"] = var[4].strip()
tab_row["Murder"] = var[5].strip()
tab_row["Forcible_Rape"] = var[6].strip()
tab_row["Robbery"] = var[7].strip()
tab_row["Aggravated_Assault"] = var[8].strip()
tab_row["Burglary"] = var[9].strip()
tab_row["Larceny_Theft"] = var[10].strip()
tab_row["Vehicle_Theft"] = var[11].strip()
tab.append(tab_row)
tabs.append(tab)
json_data = json.dumps(tabs)
output = open("output.txt", "w")
output.write(json_data)
output.close()
This might be what you want, if you can use the requests and lxml modules. The data structure presented here is very simple, adjust this to your needs.
First, get a response from your requested URL and parse the result into an HTML tree:
import requests
from lxml import etree
import json
response = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = etree.HTML(response.text)
Assuming you want to extract both tables, create this XPath and unpack the results. totals is "Number of Crimes" and rates is "Rate of Crime per 100,000 People":
xpath = './/table[#width="100%"][#style="background-color: rgb(255, 255, 255);"]//tbody'
totals, rates = tree.findall(xpath)
Extract the raw data (td.find('./') means first child item, whatever tag it has) and clean the strings (r'' raw strings are needed for Python 2.x):
raw_data = []
for tbody in totals, rates:
rows = []
for tr in tbody.getchildren():
row = []
for td in tr.getchildren():
child = td.find('./')
if child is not None and child.tag != 'br':
row.append(child.text.strip(r'\xa0').strip(r'\n').strip())
else:
row.append('')
rows.append(row)
raw_data.append(rows)
Zip together the table headers in the first two rows, then delete the redundant rows, seen as the 11th & 12th steps in slice notation:
data = {}
data['tags'] = [tag0 + tag1 for tag0, tag1 in zip(raw_data[0][0], raw_data[0][1])]
for raw in raw_data:
del raw[::12]
del raw[::11]
Store the rest of the raw data and create a JSON file (optional: eliminate whitespace with separators=(',', ':')):
data['totals'], data['rates'] = raw_data[0], raw_data[1]
with open('data.json', 'w') as f:
json.dump(data, f, separators=(',', ':'))

Categories

Resources