I'm new to web scraping. I'm trying to scrape data from a news site.
I have this code:
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
detik_url = "https://news.detik.com/indeks/2"
detik_url
html = requests.get(detik_url)
bsobj = soup(html.content, 'lxml')
bsobj
for link in bsobj.findAll("h3"):
print("Headline : {}".format(link.text.strip()))
links = []
for news in bsobj.findAll('article',{'class':'list-content__item'}):
links.append(news.a['href'])
for link in links:
page = requests.get(link)
bsobj = soup(page.content)
div = bsobj.findAll('div',{'class':'detail__body itp_bodycontent_wrapper'})
for p in div:
print(p.find('p').text.strip())
How do I utilize a Pandas Dataframe to store the obtained content into a CSV file?
You can store your content in a pandas dataframe, and then write the structure to a csv file.
Suppose you want to save all the text in your p.find('p').text.strip(), along with the headline in a csv file, you can store your headline in any variable (say head):
So, from your code:
for link in links:
page = requests.get(link)
bsobj = soup(page.content)
div = bsobj.findAll('div',{'class':'detail__body itp_bodycontent_wrapper'})
for p in div: # <----- Here we make the changes
print(p.find('p').text.strip())
In the line shown above, we do the following:
import pandas as pd

# Collect the first paragraph of every article body in one flat list.
generated_text = []
for link in links:
    page = requests.get(link)
    # Name the parser explicitly (the index page request uses 'lxml' too);
    # without it bs4 picks whichever parser happens to be installed.
    bsobj = soup(page.content, 'lxml')
    div = bsobj.findAll('div', {'class': 'detail__body itp_bodycontent_wrapper'})
    for p in div:
        first_paragraph = p.find('p')
        # find() returns None when the body holds no <p>; skip those
        # instead of failing on `.text`.
        if first_paragraph is None:
            continue
        # add a print(first_paragraph.text.strip()) here to watch the output
        generated_text.append(first_paragraph.text.strip())

# Build a single-column dataframe from the list; `head` is the column
# title you chose for it.
df = pd.DataFrame(generated_text, columns=[head])
# Save it as a CSV file, without the numeric row index.
df.to_csv('csv_name.csv', index=False)
Also, instead of the for loop, you can directly use list comprehensions and save to your CSV.
# instead of the above snippet, replace the whole `for p in div` loop by
# So from your code above:
.....
bsobj = soup(page.content)
div = bsobj.findAll('div',{'class':'detail__body itp_bodycontent_wrapper'})
# Remove the whole `for p in div:` and instead use this:
df = pd.DataFrame([p.find('p').text.strip() for p in div], columns = [head])
....
df.to_csv('csv_name.csv', index = False)
Also, you can convert your array generated from list comprehension to a numpy array and directly write it to a csv file:
import numpy as np
import pandas as pd

# On a side note:
# convert your plain list to a numpy array (or build the numpy array straight
# from a list comprehension); there are faster conversions you can explore.
nparray = np.array(generated_text)  # generated_text: the list filled earlier
# From there you can write it to a CSV file.
pd.DataFrame(nparray).to_csv('csv_name.csv')
Related
I wrote some code to scrape data off of a web page and put it in CSV format, but the end result isn't what I want. The figures/values are handled properly, but the column headings aren't. Here's what I mean:
Above is how it looks in the spreadsheet.
And this is how it looks in a text editor. I don't understand what's wrong because when I open the array in Spyder it looks alright:
Here's my code:
from numpy import array, insert, transpose, savetxt
from bs4 import BeautifulSoup as bs
from requests import get
from csv import writer
def obtain_links():
    """Download the raw HTML of each CBN statistics page.

    Returns:
        dict: the response text of every page, keyed by "inflation",
        "crude_oil", "for_res", "exch" and "m_market".
    """
    # Keep the URL table separate from the fetching so a page can be added
    # or changed in one place.
    urls = {
        "inflation": "https://www.cbn.gov.ng/rates/inflrates.asp",
        "crude_oil": "https://www.cbn.gov.ng/rates/DailyCrude.asp",
        "for_res": "https://www.cbn.gov.ng/IntOps/Reserve.asp",
        "exch": "https://www.cbn.gov.ng/rates/",
        "m_market": "https://www.cbn.gov.ng/rates/mnymktind.asp",
    }
    # Pages are requested in the order listed above.
    return {name: get(url).text for name, url in urls.items()}
data = obtain_links()
def parse_inf(data=data):
    """Parse the inflation page into a 2-D array of labels and values.

    Args:
        data: mapping holding the page HTML under the "inflation" key.
            Defaults to the module-level ``data`` captured when this
            function is defined.

    Returns:
        numpy.ndarray: the measure labels followed by the three lists of
        cell values, with the month/year labels inserted as the first column.
    """
    html = bs(data["inflation"], "lxml")
    year = html.find("td", width="62%").find("h2").text
    # Every lookup below is scoped to the same content <div>; find it once.
    content = html.find("div", id="ContentTextinner")

    # Header cells, each suffixed with the year ...
    inf_months = [f"{th.text} {year}" for th in content.find_all("th")]
    # ... except the first one, which is cut back to its first five characters.
    inf_months[0] = inf_months[0][:5]
    # transpose() leaves a 1-D array unchanged; kept for parity with the original.
    inf_months = array(inf_months).transpose()

    # Left-aligned cells hold the measure names; the last three are dropped.
    measure = [td.text for td in content.find_all("td", align="left")][:-3]

    # Three groups of "style2" cells, told apart by their width attributes;
    # only the first five characters of each cell are kept.
    values = [
        [td.text[:5] for td in content.find_all("td", class_="style2", style="width: 15%")],
        [td.text[:5] for td in content.find_all("td", class_="style2", style="width: 16%")],
        [td.text[:5] for td in content.find_all("td", class_="style2", width="20%")],
    ]
    values.insert(0, measure)
    values = array(values)

    # NOTE(review): numpy.insert converts the inserted values to the dtype of
    # `values`. If that is a fixed-width string dtype narrower than the month
    # labels, the labels would be truncated; the labels are also not stripped
    # of whitespace. Confirm against the live page, and consider
    # array(values, dtype=object) if truncation is what you are seeing.
    inf_data = insert(values, 0, inf_months, axis=1)
    return inf_data
inf_data = parse_inf()
savetxt("/home/user/Documents/Scraped Data/Inflation Data.csv",inf_data,fmt="%s",delimiter=",")
Any ideas chaps and chappettes?
I'm trying to soup-ify get requests
from bs4 import BeautifulSoup
import requests
import pandas as pd
html_page = requests.get('"https://www.dataquest.io"')
soup = BeautifulSoup(html_page, "lxml")
soup.find_all('<\a>')
However, this just returns an empty list
This will pull the table rows and assign each row to a dictionary, which is appended to a list. You may want to tweak the selectors slightly.
from bs4 import BeautifulSoup
import requests
from pprint import pprint

output_data = []  # list of dicts holding all of the table data

for i in range(1, 453):  # loop used to paginate
    # NOTE(review): the URL never uses `i`, so every iteration requests the
    # same address; put the site's page parameter here once it is known.
    data_page = requests.get('https://www.dataquest.io?')
    print(data_page)
    soup = BeautifulSoup(data_page.text, "lxml")

    # Find all of the table rows. select() returns an empty list when nothing
    # matches, so the two result lists can simply be concatenated.
    elements = soup.select('div.head_table_t') + soup.select('div.list_table_subs')
    print(len(elements))

    # Walk the rows and store each column under the matching header.
    for element in elements:
        anchor = element.select_one('div.col_1 a')
        # select_one() returns None for a row without a link; skip it
        # instead of failing on `.text`.
        if anchor is None:
            continue
        data = {
            'Name': anchor.text.strip(),
            'Page URL': anchor['href'],
        }
        output_data.append(data)  # append the row's dictionary to the list
        pprint(data)  # shows what you are receiving; can be removed
I need to download the data table at "http://www.dicj.gov.mo/web/en/information/DadosEstat_mensal/2019/index.html" and export it to Excel. Inspecting the page with Chrome's inspect tool shows that the data comes from "http://www.dicj.gov.mo/web/en/information/DadosEstat_mensal/2019/report_en.xml?id=2". However, it is no longer in table format.
url = "http://www.dicj.gov.mo/web/en/information/DadosEstat_mensal/2019/index.html"
table= pd.read_html(url)[2]
table.info()
print(table)
table.to_excel("GGR.xlsx")
I see that now your source web site returns the content in XML format.
To process it, you can apply BeautifulSoup. Assuming that you have installed it,
proceed as follows:
Import necessary modules.
from bs4 import BeautifulSoup
import pandas as pd  # required by the pd.MultiIndex and pd.DataFrame calls in the following steps
import requests
Read the source page:
page = requests.get('http://www.dicj.gov.mo/web/en/information/DadosEstat_mensal/2019/report_en.xml?id=2')
soup = BeautifulSoup(page.text, 'lxml')
Read column names and create the MultiIndex for columns in the target DataFrame:
col = soup.find_all('column')
h1 = [ col[i].contents[0] for i in range(1,3) ]
h2 = [ col[i].contents[0] for i in range(3,6) ]
cols = pd.MultiIndex.from_product([h1, h2])
Process source records, creating ind (index) and rows:
recs = soup.find_all('record')
ind = []
rows = []
for rec in recs:
cells = rec.find_all('data')
ind.append(cells[0].contents[0])
rows.append([ cells[i].contents[0] for i in range(1,7) ])
And the last step - create the target DataFrame:
df = pd.DataFrame(rows, index=ind, columns=cols)
I tried to read this table from the first page given by you, using read_html,
but I failed.
Probably the final content is loaded by some JavaScript in this page,
which can not be "seen" by read_html.
I am relatively new to programming and completely new to Stack Overflow. I thought a good way to learn would be a Python and Excel based project, but I am stuck. My plan was to scrape a website of addresses using Beautiful Soup, look up the Zillow value estimates for those addresses, and populate them into tabular form in Excel. I am unable to figure out how to get the addresses (the HTML on the site I am trying to scrape seems pretty messy), but I was able to pull Google address links from the site. Sorry if this is a very basic question; any advice would help:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd

req = Request("http://www.tjsc.com/Sales/TodaySales")
# Parse inside the `with` so the HTTP response is closed once it has been read.
with urlopen(req) as html_page:
    soup = BeautifulSoup(html_page, "lxml")

# Collect the href of every anchor on the page, counting them along the way.
count = 0
links = []
for link in soup.findAll('a'):
    links.append(link.get('href'))
    count = count + 1
print(links)
print("count is", count)

# Write one link per row, without a header row or index column.
po = links
pd.DataFrame(po).to_excel('todaysale.xlsx', header=False, index=False)
You are on the right track. Instead of 'a', you need to use a different HTML tag, 'td', for the rows, and 'th' for the column names. Here is one way to implement it. The list_slice function converts every 14 elements into one row, since the original table has 14 columns.
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "http://www.tjsc.com/Sales/TodaySales"
r = requests.get(url, verify=False)
text = r.text
soup = bs(text, 'lxml')
# Column headers: the stripped first child of every <th> in the page.
header = [th.contents[0].strip() for th in soup.findAll('th')]
# Table cells: the stripped text of every <td>, in document order.
row = [td.get_text().strip() for td in soup.find_all('td')]
def list_slice(my_list, step):
    """Split ``my_list`` into consecutive chunks of at most ``step`` items.

    Args:
        my_list: any sliceable sequence (list, tuple, str, ...).
        step: chunk size; must be at least 1.

    Returns:
        list: the chunks in order. The last chunk is shorter when the length
        of ``my_list`` is not a multiple of ``step``; an empty input gives
        an empty list.

    Raises:
        ValueError: if ``step`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it up front (a zero step already raised ValueError in range()).
    if step < 1:
        raise ValueError("step must be a positive integer")
    return [my_list[start:start + step] for start in range(0, len(my_list), step)]
# creating the final dataframe
df = pd.DataFrame(list_slice(row, 14), columns=header[:14])
I'm trying to get a data from this site
and then use some of it. Sorry for not copy-paste it but it's a long xml. So far I tried to get this data those ways:
from urllib.request import urlopen
url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
s = urlopen(url)
content = s.read()
print(content) looks good; now I would like to extract data from it:
<tabela_rozklad data-aktualizacji="1480583567">
<DZIEN>2</DZIEN>
<GODZ>3</GODZ>
<ILOSC>2</ILOSC>
<TYG>0</TYG>
<ID_NAUCZ>66</ID_NAUCZ>
<ID_SALA>79</ID_SALA>
<ID_PRZ>104</ID_PRZ>
<RODZ>W</RODZ>
<GRUPA>1</GRUPA>
<ID_ST>13</ID_ST>
<SEM>1</SEM>
<ID_SPEC>0</ID_SPEC>
</tabela_rozklad>
How can I handle this data to easy use it?
You can use Beautiful soup and capture the tags you want. The code below should get you started!
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "http://degra.wi.pb.edu.pl/rozklady/webservices.php?"
# secure url content
response = requests.get(url).content
soup = BeautifulSoup(response)
# find each tabela_rozklad
tables = soup.find_all('tabela_rozklad')
# for each tabela_rozklad looks like there is 12 nested corresponding tags
tags = ['dzien', 'godz', 'ilosc', 'tyg', 'id_naucz', 'id_sala',
'id_prz', 'rodz', 'grupa', 'id_st', 'sem', 'id_spec']
# Build one row per tabela_rozklad element: the text of each expected child
# tag, in the same order as `tags`.
rows = [[table.find(tag).text for tag in tags] for table in tables]
# Create the dataframe in a single step with the tags as column names.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
df = pd.DataFrame(rows, columns=tags)
# display first 5 rows of table
df.head()
# and the shape of the data
df.shape # 665 rows, 12 columns
# and now you can get to the information using traditional pandas functionality
# for instance, count observations by rodz
df.groupby('rodz').count()
# or subset only observations where rodz = J
J = df[df.rodz == 'J']