I wrote some code to scrape data off of a web page and put it in csv format, but the end result isn't what I want. The figures/values are handled properly, but the column headings aren't. Here's what I mean:
Above is how it looks in the spreadsheet.
And this is how it looks in a text editor. I don't understand what's wrong because when I open the array in Spyder it looks alright:
Here's my code:
from numpy import array, insert, transpose, savetxt
from bs4 import BeautifulSoup as bs
from requests import get
from csv import writer
def obtain_links():
    """Fetch the raw HTML of each CBN statistics page.

    Returns:
        dict: page label -> HTML text of the corresponding cbn.gov.ng page.
    """
    data = {"inflation": get("https://www.cbn.gov.ng/rates/inflrates.asp").text,
            "crude_oil": get("https://www.cbn.gov.ng/rates/DailyCrude.asp").text,
            "for_res": get("https://www.cbn.gov.ng/IntOps/Reserve.asp").text,
            "exch": get("https://www.cbn.gov.ng/rates/").text,
            "m_market": get("https://www.cbn.gov.ng/rates/mnymktind.asp").text}
    return data


data = obtain_links()
def parse_inf(data=data):
    """Parse the CBN inflation page into a 2-D array of strings.

    Args:
        data: mapping of page label -> HTML text (defaults to the
            module-level ``data`` fetched at import time).

    Returns:
        numpy.ndarray: one row per measure, with the month headings
        inserted as the first column entry of each row.
    """
    html = bs(data["inflation"], "lxml")
    year = html.find("td", width="62%").find("h2").text
    months = html.find("div", id="ContentTextinner").find_all("th")
    # The <th> cells contain non-breaking spaces (\xa0).  savetxt writes
    # them as two UTF-8 bytes, which a spreadsheet decoding the CSV as
    # latin-1 renders as the stray "Â" seen in the question -- so
    # normalise them to plain spaces and trim before building headings.
    inf_months = [i.text.replace("\xa0", " ").strip() for i in months]
    inf_months = [f"{i} {year}" for i in inf_months]
    # First heading is truncated to its first five characters, as before.
    inf_months[0] = inf_months[0][:5]
    inf_months = array(inf_months).transpose()
    measure = html.find("div", id="ContentTextinner").find_all("td", align="left")
    measure = [i.text for i in measure][:-3]
    # Three parallel value columns, distinguished only by the cells'
    # style/width attributes on the page; keep first 5 chars of each value.
    values = [[i.text[:5] for i in html.find("div", id="ContentTextinner").find_all("td", class_="style2", style="width: 15%")],
              [i.text[:5] for i in html.find("div", id="ContentTextinner").find_all("td", class_="style2", style="width: 16%")],
              [i.text[:5] for i in html.find("div", id="ContentTextinner").find_all("td", class_="style2", width="20%")]]
    values.insert(0, measure)
    values = array(values)
    inf_data = insert(values, 0, inf_months, axis=1)
    return inf_data


inf_data = parse_inf()
savetxt("/home/user/Documents/Scraped Data/Inflation Data.csv", inf_data, fmt="%s", delimiter=",")
Any ideas chaps and chappettes?
Related
I'm new to web scraping. I'm trying to scrape data from the news site.
I have this code:
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
detik_url = "https://news.detik.com/indeks/2"

html = requests.get(detik_url)
# Name the parser explicitly: bare soup(...) emits a GuessedAtParserWarning
# and can pick different parsers on different machines.
bsobj = soup(html.content, 'lxml')

# Print every headline on the index page.
for link in bsobj.findAll("h3"):
    print("Headline : {}".format(link.text.strip()))

# Collect the article URLs listed on the index page.
links = []
for news in bsobj.findAll('article', {'class': 'list-content__item'}):
    links.append(news.a['href'])

# Visit each article and print the first paragraph of its body.
for link in links:
    page = requests.get(link)
    bsobj = soup(page.content, 'lxml')
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:
        print(p.find('p').text.strip())
How do I utilize a Pandas Dataframe to store the obtained content into a CSV file?
You can store your content in a pandas dataframe, and then write the structure to a csv file.
Suppose you want to save all the text in your p.find('p').text.strip(), along with the headline in a csv file, you can store your headline in any variable (say head):
So, from your code:
# The loop from the question, re-indented; the body of `for p in div`
# is where the change below is made.
for link in links:
    page = requests.get(link)
    bsobj = soup(page.content)
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:  # <----- Here we make the changes
        print(p.find('p').text.strip())
In the line shown above, we do the following:
import pandas as pd

# Create an empty list to collect every extracted paragraph
# instead of printing it.
generated_text = []

# The headline that will become the CSV's column name -- store your
# headline in any variable (say head); it was never defined in the
# original snippet, which made `columns=[head]` raise a NameError.
head = "Headline"

for link in links:
    page = requests.get(link)
    bsobj = soup(page.content)
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:
        # print statement if you want to see the output
        generated_text.append(p.find('p').text.strip())  # <---- save the data

# Build a one-column dataframe from the list...
df = pd.DataFrame(generated_text, columns=[head])
# ...and save it into a csv file (index=False drops the row numbers).
df.to_csv('csv_name.csv', index=False)
Also, instead of the for loop, you can directly use list comprehensions and save to your CSV.
# Instead of the above snippet, replace the whole `for p in div` loop by
# a single list comprehension. So, from your code above:
# ... (earlier lines unchanged)
bsobj = soup(page.content)
div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
# Remove the whole `for p in div:` loop and instead use this:
df = pd.DataFrame([p.find('p').text.strip() for p in div], columns=[head])
# ... then write the frame out:
df.to_csv('csv_name.csv', index=False)
Also, you can convert your array generated from list comprehension to a numpy array and directly write it to a csv file:
import numpy as np
import pandas as pd

# On a side note: a plain list (e.g. the result of a list comprehension)
# can be converted to a numpy array with np.array(...) / np.asarray(...),
# and written to a csv through a dataframe.  (The original line had an
# unbalanced extra closing parenthesis -- a SyntaxError.)
pd.DataFrame(nparray).to_csv('csv_name.csv')
Please find below the code that I am using to put distorted data into CSV file in the table format:
import requests
from bs4 import BeautifulSoup
import csv
f = open('moneyControl-bonus', 'w', newline='')
writer = csv.writer(f)
f2 = open('moneyControl-dividend', 'w', newline='')
writer2 = csv.writer(f2)

url = 'https://www.moneycontrol.com/stocks/marketinfo/upcoming_actions/home.html'
headers = {'user-agent': 'Mozilla/5.0'}
# headers must be passed by keyword -- as a second positional argument
# it would be treated as query params and the user-agent never sent.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

# Bonus tables: one CSV row per <tr>, one field per cell.
# (`writer.writerow[row]` indexed the method instead of calling it, and
# writing the bs4 Tag itself is what produced the "distorted" output.)
div = soup.find_all('div', class_='tbldata36 PT10')[0]
for table in div.find_all('table'):
    for row in table.find_all('tr'):
        writer.writerow([cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])])

# Dividend tables: search inside div2, not the whole soup.
div2 = soup.find_all('div', class_='tbldata36 PT20')[0]
for table2 in div2.find_all('table'):
    for row2 in table2.find_all('tr'):
        writer2.writerow([cell.get_text(strip=True) for cell in row2.find_all(['td', 'th'])])

# Close the files so buffered rows are flushed to disk.
f.close()
f2.close()
Have you tried using pandas? It is the default library to use for writing the csv format.
# pip install pandas
import pandas as pd
You can write into the CSV file by passing either a list of lists or a dictionary.
# From a dict: each key becomes a column name, each list that column's values.
df = pd.DataFrame({'col1': [1, 2, 3, 4], 'col2': ['a', 'b', 'c', 'd']})
# From a list of lists: each inner list is one ROW, so pair the values up.
# (The original passed two 4-element rows against 2 column names, which
# raises "ValueError: 2 columns passed, passed data had 4 columns".)
df = pd.DataFrame([[1, 'a'], [2, 'b'], [3, 'c'], [4, 'd']], columns=['col1', 'col2'])
# Write it out -- replace the name with your own path and file name.
df.to_csv('file_name.csv')
You can use many different formats with that one such as Excel, table, text, json etc.
Please take a look at the official DataFrame Documentation
I have a csv called 'df' with 1 column. I have a header and 10 urls.
Col
"http://www.cnn.com"
"http://www.fark.com"
etc
etc
This is my ERROR code
import bs4 as bs
df_link = pd.read_csv('df.csv')
# BUG(review): iterating a DataFrame directly yields its COLUMN LABELS,
# so `link` is the string "Col" and link[0] is "C" -- hence the
# "ValueError: unknown url type: C".  Iterate rows (e.g. iterrows) instead.
for link in df_link:
x = urllib2.urlopen(link[0])
new = x.read()
# Code does not even get past here as far as I checked
soup = bs.BeautifulSoup(new,"lxml")
for text in soup.find_all('a',href = True):
# NOTE(review): this appends the href back onto the bs4 Tag `text` itself;
# presumably a separate results list (e.g. `links`) was intended.
text.append((text.get('href')))
I am getting an error which says
ValueError: unknown url type: C
I also get other variations of this error like
The issue is, it is not even getting past
x = urllib2.urlopen(link[0])
On the other hand; This is the WORKING CODE...
url = "http://www.cnn.com"
x = urllib2.urlopen(url)
new = x.read()
soup = bs.BeautifulSoup(new, "lxml")
links = []  # accumulator -- it was never defined in the original snippet
for link in soup.find_all('a', href=True):
    links.append(link.get('href'))
Fixed answer
I didn't realize you were using pandas, so what I said wasn't very helpful.
The way you want to do this using pandas is to iterate over the rows and extract the info from them. The following should work without having to get rid of the header:
import bs4 as bs
import pandas as pd
import urllib2

df_link = pd.read_csv('df.csv')

links = []  # collect every href found on every page
for link in df_link.iterrows():
    # iterrows() yields (index, row) pairs; the row is indexed by column name,
    # so the header no longer needs to be stripped off.
    url = link[1]['Col']
    x = urllib2.urlopen(url)
    new = x.read()
    soup = bs.BeautifulSoup(new, "lxml")
    for text in soup.find_all('a', href=True):
        # Store the href in the results list -- the original appended it
        # back onto the bs4 Tag itself.
        links.append(text.get('href'))
Original misleading answer below
It looks like the header of your CSV file is not being treated separately, and so in the first iteration through df_link, link[0] is "Col", which isn't a valid URL.
I'm trying to get data from this site
and then use some of it. Sorry for not copy-pasting it here, but it's a long xml. So far I tried to get this data in the following ways:
from urllib.request import urlopen
url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
# Use the response as a context manager so the connection is closed
# deterministically once the body has been read.
with urlopen(url) as s:
    content = s.read()
as print(content) looks good, now I would like to get data from it
<tabela_rozklad data-aktualizacji="1480583567">
<DZIEN>2</DZIEN>
<GODZ>3</GODZ>
<ILOSC>2</ILOSC>
<TYG>0</TYG>
<ID_NAUCZ>66</ID_NAUCZ>
<ID_SALA>79</ID_SALA>
<ID_PRZ>104</ID_PRZ>
<RODZ>W</RODZ>
<GRUPA>1</GRUPA>
<ID_ST>13</ID_ST>
<SEM>1</SEM>
<ID_SPEC>0</ID_SPEC>
</tabela_rozklad>
How can I handle this data to easy use it?
You can use Beautiful soup and capture the tags you want. The code below should get you started!
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
# secure url content
response = requests.get(url).content
# Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
soup = BeautifulSoup(response, 'lxml')

# find each tabela_rozklad
tables = soup.find_all('tabela_rozklad')

# each tabela_rozklad appears to contain 12 nested corresponding tags
tags = ['dzien', 'godz', 'ilosc', 'tyg', 'id_naucz', 'id_sala',
        'id_prz', 'rodz', 'grupa', 'id_st', 'sem', 'id_spec']

# Build the rows first and construct the DataFrame once:
# DataFrame.append was removed in pandas 2.0, and appending row-by-row
# was quadratic anyway.  (The old loop also shadowed the builtin `all`.)
rows = [[table.find(tag).text for tag in tags] for table in tables]
df = pd.DataFrame(rows, columns=tags)

# display first 5 rows of table
df.head()
# and the shape of the data
df.shape  # 665 rows, 12 columns

# and now you can get to the information using traditional pandas functionality
# for instance, count observations by rodz
df.groupby('rodz').count()
# or subset only observations where rodz = J
J = df[df.rodz == 'J']
I'm having a few issues. First off, when I try to write a CSV file from a web scraping, nothing is written. The file does save, but it's completely blank. Ultimately, I'm hoping to open it and call on the water temperature column to calculate an average.
My other issue is that I only want a few of the columns from the table in my CSV file. Can someone verify that what I did is correct? I only want the first 3 columns, and then the 14th column.
Thank you!
import sys
import urllib2
import csv
import requests
from bs4 import BeautifulSoup
r_temp1 = requests.get('http://www.ndbc.noaa.gov/data/realtime2/BZBM3.txt')
html_temp1 = r_temp1.text
soup = BeautifulSoup(html_temp1, "html.parser")
# NOTE(review): this URL serves plain text, not HTML, so find('table')
# returns None -- there are no rows to write (see the answers below).
table_temp1 = soup.find('table')
# BUG(review): `table` is undefined -- the variable above is `table_temp1`.
rows_temp1 = table.findAll('tr')
rows_temp1 = rows_temp1[1:]
#writing to a csv file
csvfile_temp1 = open("temp1.csv","wb")
output_temp1 = csv.writer(csvfile_temp1, delimiter=',',quotechar='"',quoting=csv.QUOTE_MINIMAL)
for row in rows_temp1:
# BUG(review): `cells` is never assigned -- presumably row.findAll('td').
Year = cells[0].text.strip()
Month = cells[1].text.strip()
Day = cells[2].text.strip()
W_temp = cells[14].text.strip()
# BUG(review): the writer is named `output_temp1`, not `output`.
output.writerow([Year,Month,Day,W_temp])
csvfile_temp1.close()
You're not seeing anything in the file because there are no rows in rows_temp1. That array is empty because there are no table rows in a text file. It looks like you are expecting an HTML file with a table, but the file is just a plain text file.
Here is a version that does what you want:
import csv
import requests
r_temp1 = requests.get('http://www.ndbc.noaa.gov/data/realtime2/BZBM3.txt')
rows_temp1 = r_temp1.text.split('\n')

# Writing to a csv file.  csv.writer needs text mode with newline='' on
# Python 3 ("wb" makes writerow raise a TypeError there), and the
# with-block guarantees the file is flushed and closed.
with open("temp1.csv", "w", newline="") as csvfile_temp1:
    output_temp1 = csv.writer(csvfile_temp1, delimiter=',', quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
    for row in rows_temp1:
        if not row:
            continue
        cells = row.split()
        # Guard against short/partial lines so cells[14] cannot IndexError.
        if len(cells) < 15:
            continue
        Year = cells[0].strip()
        Month = cells[1].strip()
        Day = cells[2].strip()
        W_temp = cells[14].strip()
        output_temp1.writerow([Year, Month, Day, W_temp])
Running your code gives:
File "hh.py", line 11, in <module>
rows_temp1 = table.findAll('tr')
NameError: name 'table' is not defined
And indeed in line 10 you define table_temp1, not table. I don't know if you have other issues, but start by reading the errors you get.