Putting scraped data from BeautifulSoup into a CSV file - Python

I want to put the data I'm scraping from the website into a CSV file. My first attempt was with Scrapy, but I couldn't get the syntax right. I then managed to scrape the data using BeautifulSoup, but I just don't know how to put the results into a CSV file.
import requests
from bs4 import BeautifulSoup

URL = "https://www.practo.com/tests/glycosylated-haemoglobin-blood/p?city=delhi"
page = requests.get(URL)
#print(page)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='root-app')
#print(results.prettify())

job_elems = results.find_all('div', class_='u-padrl--std')
#<h1 class="o-font-size--24 u-font-bold u-marginb--std">HbA1c Test</h1>

for job_elem in job_elems:
    title_elem = job_elem.find('h1', class_='o-font-size--24 u-font-bold u-marginb--std')
    also_known = job_elem.find('span', class_="u-font-bold")
    cost = job_elem.find('div', class_="o-font-size--22 u-font-bold o-f-color--primary")
    what_test = job_elem.find('div', class_="c-markdown--unstyled")
    #test_prep = job_elem.find('div', class_="c-tabsection__content c-pp__accordion-item__content active")
    #temp = job_elem.find('p')
    print(title_elem.text)
    print(also_known.text)
    print(cost.text)
    print(what_test.text)
    #print(temp.text)
    #print(test_prep.text)
    print()

text_content = results.find_all('div', class_='c-markdown--unstyled')
# c-tabsection__content c-pp__accordion-item__content active
# c-tabsection c-pp__accordion-item u-br-rule u-padtb--std--half active
for item in text_content:
    prep = item.find('p')
    #, class_="c-tabsection__content c-pp__accordion-item__content active")
    print(prep.text)
    print('xxo')

import requests
from bs4 import BeautifulSoup
# import the csv module
import csv

URL = "https://www.practo.com/tests/glycosylated-haemoglobin-blood/p?city=delhi"
page = requests.get(URL)
#print(page)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='root-app')
#print(results.prettify())

job_elems = results.find_all('div', class_='u-padrl--std')
#<h1 class="o-font-size--24 u-font-bold u-marginb--std">HbA1c Test</h1>

rows = []
fields = ['title_elem', 'also_known', 'cost', 'what_test']
filename = "myfile.csv"

for job_elem in job_elems:
    title_elem = job_elem.find('h1', class_='o-font-size--24 u-font-bold u-marginb--std').text.encode("utf-8")
    also_known = job_elem.find('span', class_="u-font-bold").text.encode("utf-8")
    cost = job_elem.find('div', class_="o-font-size--22 u-font-bold o-f-color--primary").text.encode("utf-8")
    what_test = job_elem.find('div', class_="c-markdown--unstyled").text.encode("utf-8")
    row = [title_elem, also_known, cost, what_test]
    rows.append(row)

with open(filename, 'w') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
    # writing the data rows
    csvwriter.writerows(rows)

text_content = results.find_all('div', class_='c-markdown--unstyled')
# c-tabsection__content c-pp__accordion-item__content active
# c-tabsection c-pp__accordion-item u-br-rule u-padtb--std--half active
for item in text_content:
    prep = item.find('p')
    #, class_="c-tabsection__content c-pp__accordion-item__content active")
    print(prep.text)
    print('xxo')
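
One caveat with the CSV attempt above: .text.encode("utf-8") turns every value into a bytes object, so the cells come out as b'...' literals in the file. A minimal sketch of just the writing step using plain strings, assuming job_elems has already been collected as in the scraper above:

import csv

fields = ['title', 'also_known', 'cost', 'what_test']
rows = []
for job_elem in job_elems:
    title = job_elem.find('h1', class_='o-font-size--24 u-font-bold u-marginb--std')
    also_known = job_elem.find('span', class_="u-font-bold")
    cost = job_elem.find('div', class_="o-font-size--22 u-font-bold o-f-color--primary")
    what_test = job_elem.find('div', class_="c-markdown--unstyled")
    # .text is already a str; guard against missing elements so a None does not crash the row
    rows.append([el.text.strip() if el else '' for el in (title, also_known, cost, what_test)])

# newline='' avoids blank lines on Windows; utf-8 keeps non-ASCII text readable
with open('myfile.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(fields)
    writer.writerows(rows)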

You can use the xlsxwriter library.

import xlsxwriter

workbook = xlsxwriter.Workbook("file.xlsx")
worksheet = workbook.add_worksheet()
# write(row, column, element) puts one value into one cell;
# row and column are zero-based indices, element is the value to write
worksheet.write(row, column, element)
workbook.close()
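
A fuller sketch of that suggestion, assuming the rows list of [title, also_known, cost, what_test] values built in the question's loop (the header names here are illustrative):

import xlsxwriter

workbook = xlsxwriter.Workbook("file.xlsx")
worksheet = workbook.add_worksheet()

headers = ['title', 'also_known', 'cost', 'what_test']
for col, header in enumerate(headers):
    worksheet.write(0, col, header)               # header row

for row_idx, row in enumerate(rows, start=1):     # data rows start under the header
    for col_idx, value in enumerate(row):
        worksheet.write(row_idx, col_idx, value)

workbook.close()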

Related

Assemble a table from scraped data spread across several tables, where each element contains a link to its data

I need to set up a table with the following columns:
date
time
Agua 6 Horas
Agua Diario
(QFE)
Radiación
Humedad
Viento promedio
Viento Temperatura del Aire Seco
This is the station link; in table 6 are the items of interest that fill each of the columns of the table I intend to assemble.
Each desired element has an index in that table; the ones that matter are 57, 60, 64, 89, 66, 28 and 26.
I need to get the data from this table and fill each of the columns of the table I want to build with it.
When accessing an element of the table, the data is grouped in another table.
That's how I thought of getting the data; I just can't gather it into a single table. If anyone can help me I would be grateful.
import csv
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# filename = "data.csv"
# f = open(filename, "w", newline='')
# data = csv.writer(f)

pages = [
    'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/57/119',
    'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/60/125'
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/64/130',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/89/200',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/66/132',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330007/28/61',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330007/28/60',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/26/58'
]

def make_soup(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    return soup

def get_links(url):
    # collect the links to the monthly data tables for one element of the station
    soup = make_soup(url)
    a_tags = soup.find_all('a', href=re.compile(r"^/application/informacion/datosMensualesDelElemento/330006/"))
    links = [urljoin(url, a.get('href')) for a in a_tags]
    return links

def get_data(link):
    # scrape the rows of the data table behind one monthly link
    soup = make_soup(link)
    table = soup.find('table', {'class': 'table table-striped table-bordered'})
    data = [[' '.join(item.text.split()) for item in tcel.select("td")]
            for tcel in table.select("tr")]
    return data

if __name__ == '__main__':
    # earlier attempts, kept for reference:
    # data = []
    # dataframe = []
    # df = pd.DataFrame()
    # for page in pages:
    #     make_soup(page)
    #     links = get_links(page)
    #     # print(links)
    #     for link in links:
    #         data.append(get_data(link))
    #         break
    #     dataframe.append(pd.DataFrame(data))
    # df = pd.concat(dataframe, axis=0)
    # print(df)
    # make_soup(url)
    # links = get_links(url)
    # for link in links:
    #     data.append(get_data(link))
    #     print(data)
    #     break
    # df = pd.DataFrame(data)
    # df.to_csv('data.csv', index=False)
    # df = pd.DataFrame(data)
    # print(df)

    columns_name = ['Data Medicao', 'Hora Medicao', 'RRR6 (mm)', 'Traza ()']
    data = []
    # for i in range(len(data)+1):
    #     print(data[i])
    # df = pd.DataFrame(data)
    for page in pages:
        links = get_links(page)   # was get_links(pages[0]), which ignored the loop variable
        for link in links:
            data.append(get_data(link))
    df = pd.DataFrame(data)
    df = df.iloc[1:]
    df.to_csv('data.csv', index=False)
    print(df)
    # links = get_links(url)
    # for link in links:
    #     data.writerows(get_data(link))
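
One way to gather the pieces into a single table (a sketch, not a tested solution for this site: it assumes every scraped row starts with the date and the time, and the element labels and page grouping below are illustrative) is to build one DataFrame per element and then merge them on date and time:

import pandas as pd

def element_frame(element_pages, value_name):
    # one DataFrame per element: date, time and that element's value
    # (assumes the first three cells of every scraped row are date, time, value)
    rows = []
    for page in element_pages:
        for link in get_links(page):
            for row in get_data(link):
                if len(row) >= 3:
                    rows.append([row[0], row[1], row[2]])
    return pd.DataFrame(rows, columns=['date', 'time', value_name])

# illustrative mapping of column names to the inventory pages scraped above
element_pages = {
    'Agua 6 Horas': [pages[0]],
    'Agua Diario': [pages[1]],
}

frames = [element_frame(urls, name) for name, urls in element_pages.items()]

# merge every element onto the same date/time axis so each one becomes a column
combined = frames[0]
for frame in frames[1:]:
    combined = combined.merge(frame, on=['date', 'time'], how='outer')

combined.to_csv('combined.csv', index=False)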

How can I convert a list of crawled data to an Excel column?

import openpyxl
xl_file = openpyxl.Workbook()
xl_sheet = xl_file.active

from urllib.request import urlopen
from bs4 import BeautifulSoup

stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
maxPage = source.find_all("table", align="center")
mp = maxPage[0].find_all("td", class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])

for page in range(1, 10):
    print(str(page))
    url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem + '&page=' + str(page)
    html = urlopen(url)
    source = BeautifulSoup(html.read(), "html.parser")
    srlists = source.find_all("tr")
    isCheckNone = None
    if (page % 1) == 0:
        time.sleep(0)
    for i in range(1, len(srlists) - 1):
        if srlists[i].span != isCheckNone:
            srlists[i].td.text
            data1 = srlists[i].find_all("td", align="center")
            data2 = srlists[i].find_all("td", class_="num")
            print(srlists[i].find_all("td", align="center")[0].text, srlists[i].find_all("td", class_="num")[0].text)
            for item in data1:
                xl_sheet.append([item.get_text()])
This is what I've done for crawling stock data from the site.
I've successfully crawled the stock data.
However, I couldn't save the data into an Excel file.
I tried, but it only showed the date data without the price data.
How can I convert the results to an Excel file?
There were two things you missed:
1) a mistake in importing packages (time is used but never imported);
2) data2, which contains the prices, was never appended to the Excel sheet.
Here is the final code which will give your desired output. Just put your own folder location for saving the Excel file.
import time
from openpyxl import Workbook

xl_file = Workbook()
xl_sheet = xl_file.active

from urllib.request import urlopen
from bs4 import BeautifulSoup

i = 0
stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
maxPage = source.find_all("table", align="center")
mp = maxPage[0].find_all("td", class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])

for page in range(1, 10):
    print(str(page))
    url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem + '&page=' + str(page)
    html = urlopen(url)
    source = BeautifulSoup(html.read(), "html.parser")
    srlists = source.find_all("tr")
    isCheckNone = None
    if (page % 1) == 0:
        time.sleep(0)
    for i in range(1, len(srlists) - 1):
        if srlists[i].span != isCheckNone:
            srlists[i].td.text
            data1 = srlists[i].find_all("td", align="center")
            data2 = srlists[i].find_all("td", class_="num")
            #print(srlists[i].find_all("td", align="center")[0].text, srlists[i].find_all("td", class_="num")[0].text)
            # append the date (data1) and the price (data2) side by side
            for item1, item2 in zip(data1, data2):
                xl_sheet.append([item1.get_text(), item2.get_text()])

print(xl_sheet)
xl_file.save(r'C:\Users\Asus\Desktop\vi.xlsx')
Suggestion: you can use the yfinance package for Python to download stock data easily.
You can follow this link >> https://pypi.org/project/yfinance/
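
A minimal sketch of that suggestion (assuming '028300.KQ' is the right Yahoo Finance symbol for this KOSDAQ-listed code; the date range is illustrative):

import yfinance as yf

# download daily OHLCV data as a pandas DataFrame indexed by date
data = yf.download('028300.KQ', start='2019-01-01', end='2019-06-30')

# writing it to Excel is then a single call (needs openpyxl or xlsxwriter installed)
data.to_excel('stock_028300.xlsx')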

How to extract daily close from WSJ using Python?

I used Python 3 and pandas to parse the daily close from WSJ into Excel. However, the daily close shown on the web page cannot be extracted. Here is the link: https://quotes.wsj.com/index/COMP/historical-prices
How do I download the close data shown on screen into Excel?
And how do I download the "DOWNLOAD A SPREADSHEET" button's file into Excel under another name, like comp.xlsx?
Here is the code:
import requests
import pandas as pd

url = 'https://quotes.wsj.com/index/COMP/historical-prices'
jsonData = requests.get(url).json()

final_df = pd.DataFrame()
for row in jsonData['data']:
    #row = jsonData['data'][1]
    data_row = []
    for idx, colspan in enumerate(row['colspan']):
        colspan_int = int(colspan[0])
        data_row.append(row['td'][idx] * colspan_int)
    flat_list = [item for sublist in data_row for item in sublist]
    temp_row = pd.DataFrame([flat_list])
    final_df = final_df.append(temp_row, sort=True).reset_index(drop=True)

wait2 = input("PRESS ENTER TO CONTINUE.")
Follow-up question code:
#
url = 'https://quotes.wsj.com/index/HK/XHKG/HSI/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('HSI.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.xlsx', index = None, header=True)
#
url = 'https://quotes.wsj.com/index/SPX/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('SPX.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.xlsx', index = None, header=True)
#
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('COMP.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.xlsx', index = None, header=True)
The URL in the code above is wrong; once you have downloaded the spreadsheet you can do "Get Info" on it (if on a Mac) and look at "Where From:". You will see the real download URL is of the form below.
import requests
import pandas as pd
import io
#original URL had a bunch of other parameters I omitted, only these seem to matter but YMMV
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
response = requests.get(url)
#do this if you want the CSV written to your machine
open('test_file.csv', 'wb').write(response.content)
# this decodes the content of the downloaded response and presents it to pandas
df_test = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
To answer your additional question -- you can simply loop across a list of tickers or symbols, something like:
base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP', 'SPX', 'HK/XHKG/HSI']
for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name=ticker))
    #do this if you want the CSV written to your machine
    open('prices_' + ticker.replace('/', '-') + '.csv', 'wb').write(response.content)
Note for HK/XHKG/HSI, we need to replace the slashes with hyphens or it's not a valid filename. You can also use this pattern to make dataframes.
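
A sketch of that pattern extended to build a DataFrame per ticker and save each one to its own Excel file (the file names are illustrative):

import io
import requests
import pandas as pd

base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP', 'SPX', 'HK/XHKG/HSI']

for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name=ticker))
    # decode the CSV payload straight into a DataFrame, no temporary file needed
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    # slashes are not valid in file names, so HK/XHKG/HSI becomes hk-xhkg-hsi
    safe_name = ticker.replace('/', '-').lower()
    df.to_excel(safe_name + '.xlsx', index=False)  # e.g. comp.xlsx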

Can't write to a CSV file in Python

I am trying to write scraped data into a CSV file using a pandas DataFrame, but the CSV is empty even after the program runs. The headers are written first, but they get overwritten once the DataFrame comes into play.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re as resju
import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')

links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

headers = ['Name', 'LINK']
# this is output file, u can change the path as you desire, default is the working directory
file = open('data123.csv', 'w', encoding="utf-8")
writer = csv.writer(file)
writer.writerow(headers)

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']
    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text
    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)

    df = pd.DataFrame(zip(Name, final_link))
    df.to_csv(file, header=None, index=False)

file.close()
df.head() returns:

   0  1
0  ل  h
1  ي  t
2  ل  t
3  ى  p
4     s

   0  1
0  ل  h
1  ي  t
2  ل  t
3  ى  p
4     s

Any suggestions?
It seems you are using a mix of libraries to write the CSV; pandas handles all of this nicely, so there is no need to use Python's built-in csv module as well.
Another problem is that pd.DataFrame(zip(Name, final_link)) zips two strings together, so the frame is built character by character, which is why df.head() shows one letter per row.
I've modified your code below: it collects every name and link into two lists, builds the DataFrame once at the end, and writes it out as a CSV.
Also, by passing header=None you were setting the columns to nothing, so they would be referenced by an index number.
from bs4 import BeautifulSoup
import requests
import re as resju
#import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')

links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

names_ = []  # global list to hold all iterable variables from your loops
final_links_ = []

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    names_.append(Name)  # append to global list
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']
    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text
    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)
    final_links_.append(final_link)  # append to global list

df = pd.DataFrame(zip(names_, final_links_))  # build the frame once from the global lists
df.columns = ['Name', 'LINK']
df.to_csv('data123.csv', index=False)  # write straight to a path; no open() needed

How can I export a web-scraped table into CSV with multiple rows?

I wrote this code in Python 2.7.13 for scraping a data table from a website.
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + "," + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem,Devizanev,Egyseg,Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
I want to export this into CSV with the following 4 columns:
"Penznem Devizanev Egyseg Penznemforintban"
The data are separated with "," but the last value is really ONE number written with a decimal comma (283,45), so it ends up split across two cells.
How can I fix it?
You cannot avoid that last comma directly, but what you can simply do is use another separator, i.e. ";" (semicolon).
When you open the file in Excel or Calc, select the semicolon (;) as the separator and you will get the result you expect!
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + ";" + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem;Devizanev;Egyseg;Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
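
An alternative sketch (not part of the answer above): if the output must stay comma-separated, Python's csv module quotes any field that itself contains a comma, so a value like 283,45 stays in one cell. Written for Python 3 for illustration:

import csv
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("https://www.mnb.hu/arfolyamok").content, "html.parser")

rows = []
for record in soup.find_all('tr'):
    cells = [td.get_text(strip=True) for td in record.find_all('td')]
    if cells:
        rows.append(cells)

# csv.writer wraps fields containing the delimiter in quotes, so "283,45" is not split
with open("proba.csv", "w", newline='', encoding="utf-16") as f:
    writer = csv.writer(f)
    writer.writerow(["Penznem", "Devizanev", "Egyseg", "Penznemforintban"])
    writer.writerows(rows)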
