I'm very new at this and I'm having trouble writing data from a web scrape to a CSV file. Can anyone help? Thank you!
import csv
import requests
from bs4 import BeautifulSoup

# web scraping wind data
r_wind = requests.get('http://w1.weather.gov/data/obhistory/KCQX.html')
html_wind = r_wind.text
soup = BeautifulSoup(html_wind, "html.parser")
table = soup.find('table')
rows_wind = table.findAll('tr')
rows_wind = rows_wind[1:]  # skip the header row

# writing to a csv file ("wb" is the right mode for the csv module on Python 2)
csvfile_wind = open("wind.csv", "wb")
output_wind = csv.writer(csvfile_wind, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in rows_wind:
    cells = row.findAll('td')  # 'cells' was never defined in the original
    Date = cells[0].text.strip()
    Time = cells[1].text.strip()
    Wind = cells[2].text.strip()
    output_wind.writerow([Date, Time, Wind])  # the writer is named output_wind, not output
csvfile_wind.close()
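If you are on Python 3 rather than Python 2, open the file in text mode with newline='' instead of "wb"; the rest stays the same. A minimal sketch:

import csv

# Python 3 variant: text mode plus newline='' so the csv module controls line endings
with open("wind.csv", "w", newline="", encoding="utf-8") as csvfile_wind:
    output_wind = csv.writer(csvfile_wind, quoting=csv.QUOTE_MINIMAL)
    for row in rows_wind:
        cells = row.findAll('td')
        output_wind.writerow([c.text.strip() for c in cells[:3]])  # Date, Time, Wind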
I want to put the data I'm scraping from the website into a CSV file. My first attempt used Scrapy, but I couldn't get the syntax right. I managed to scrape the data with BeautifulSoup; I just don't know how to write it to a CSV file.
import requests
from bs4 import BeautifulSoup

URL = "https://www.practo.com/tests/glycosylated-haemoglobin-blood/p?city=delhi"
page = requests.get(URL)
#print(page)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='root-app')
#print(results.prettify())
job_elems = results.find_all('div', class_='u-padrl--std')
# <h1 class="o-font-size--24 u-font-bold u-marginb--std">HbA1c Test</h1>
for job_elem in job_elems:
    title_elem = job_elem.find('h1', class_='o-font-size--24 u-font-bold u-marginb--std')
    also_known = job_elem.find('span', class_="u-font-bold")
    cost = job_elem.find('div', class_="o-font-size--22 u-font-bold o-f-color--primary")
    what_test = job_elem.find('div', class_="c-markdown--unstyled")
    #test_prep = job_elem.find('div', class_="c-tabsection__content c-pp__accordion-item__content active")
    #temp = job_elem.find('p')
    print(title_elem.text)
    print(also_known.text)
    print(cost.text)
    print(what_test.text)
    #print(temp.text)
    #print(test_prep.text)
    print()

text_content = results.find_all('div', class_='c-markdown--unstyled')
# c-tabsection__content c-pp__accordion-item__content active
# c-tabsection c-pp__accordion-item u-br-rule u-padtb--std--half active
for item in text_content:
    prep = item.find('p')
    print(prep.text)
print('xxo')
import requests
from bs4 import BeautifulSoup
# import the csv module
import csv

URL = "https://www.practo.com/tests/glycosylated-haemoglobin-blood/p?city=delhi"
page = requests.get(URL)

soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='root-app')

job_elems = results.find_all('div', class_='u-padrl--std')
# <h1 class="o-font-size--24 u-font-bold u-marginb--std">HbA1c Test</h1>
rows = []
fields = ['title_elem', 'also_known', 'cost', 'what_test']
filename = "myfile.csv"
for job_elem in job_elems:
    # no .encode("utf-8") here: on Python 3, handing bytes to a text-mode
    # csv writer puts literal b'...' strings into the file
    title_elem = job_elem.find('h1', class_='o-font-size--24 u-font-bold u-marginb--std').text
    also_known = job_elem.find('span', class_="u-font-bold").text
    cost = job_elem.find('div', class_="o-font-size--22 u-font-bold o-f-color--primary").text
    what_test = job_elem.find('div', class_="c-markdown--unstyled").text
    row = [title_elem, also_known, cost, what_test]
    rows.append(row)

with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
    # writing the data rows
    csvwriter.writerows(rows)

text_content = results.find_all('div', class_='c-markdown--unstyled')
for item in text_content:
    prep = item.find('p')
    print(prep.text)
print('xxo')
You can use the xlsxwriter library.
import xlsxwriter

workbook = xlsxwriter.Workbook("file.xlsx")
worksheet = workbook.add_worksheet()
# write() takes (row_index, col_index, value) and writes one cell at a time;
# here we reuse the 'rows' list built in the scraping loop above
for row_index, row in enumerate(rows):
    for col_index, element in enumerate(row):
        worksheet.write(row_index, col_index, element)
workbook.close()
I want to scrape web data using input values from a CSV file, running the scrape for each row value, and save the output back to the same file.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
ciks = pd.read_csv("ciks.csv")
ciks.head()
Output:

       CIK
0  1557822
1  1598429
2  1544670
3  1574448
4  1592290
Then:

for x in ciks:
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + x + "&owner=exclude&action=getcompany"
    r = request.urlopen(url)
    bytecode = r.read()
    htmlstr = bytecode.decode()
    soup = BeautifulSoup(bytecode)
    t = soup.find('span', {'class': 'companyName'})
    print(t.text)
I got an error:
----> 9 print (t.text)
AttributeError: 'NoneType' object has no attribute 'text'
Here, I want to scrape web data taking each row value as input from the CSV file.
Iterating directly over a DataFrame yields its column names, so x above was the string 'CIK' rather than an actual CIK value; the resulting page has no companyName span and find() returns None. It is easier to convert the column values to a list and then use that in the for loop - see the solution below.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd

df = pd.read_csv("ciks.csv")
mylist = df['CIK'].tolist()  # CIK is the column name
company = []
for item in mylist:
    print(item)
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + str(item) + "&owner=exclude&action=getcompany"
    r = request.urlopen(url)
    bytecode = r.read()
    htmlstr = bytecode.decode()
    soup = BeautifulSoup(bytecode, features="lxml")
    t = soup.find('span', {'class': 'companyName'})
    company.append(t.text)
    print(t.text)

df = df.assign(company=company)  # assign() returns a new DataFrame, so bind it back
print(df)
df.to_csv("ciks.csv", index=False)  # index=False keeps re-runs from adding an extra column
I want to retrieve a specific stat (PPDA) from multiple matches on this site:
https://understat.com/match/xxxx
I have created the following to parse the HTML and loop through each match using Python, but I am struggling with how to extract the specific stat, load it into a CSV file, and graph it. I am a beginner and any help would be appreciated!
Code:
import pandas as pd
import re
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime
import csv

for i in range(9577, 9807):
    ppda_url = 'https://understat.com/match/' + str(i)
    ppda_data = requests.get(ppda_url)
    ppda_html = ppda_data.content
    soup = BeautifulSoup(ppda_html, 'lxml')

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(chrome_options=options)
driver.get(ppda_url)
soup = BeautifulSoup(driver.page_source, 'lxml')
To extract the data with BeautifulSoup and write it to a CSV file, first find the div element with the text PPDA. Then find the next div element with the class progress-value (the home value), then the next div element with the same class after that (the away value), and take the text of those two divs. Write it to a CSV file like so.
import requests
from bs4 import BeautifulSoup
import csv

with open('ppda.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for i in range(9577, 9807):
        ppda_url = 'https://understat.com/match/' + str(i)
        ppda_data = requests.get(ppda_url)
        ppda_html = ppda_data.content
        soup = BeautifulSoup(ppda_html, 'lxml')
        ppda = soup.find("div", string='PPDA')
        home = ppda.findNext('div', {'class': "progress-value"})
        away = home.findNext('div', {'class': "progress-value"})
        print(home.text, away.text)
        writer.writerow([home.text, away.text])
To graph it, look at matplotlib for a start.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

rows = []
for i in range(9577, 9807):
    ppda_url = 'https://understat.com/match/' + str(i)
    ppda_data = requests.get(ppda_url)
    ppda_html = ppda_data.content
    soup = BeautifulSoup(ppda_html, 'lxml')
    ppda = soup.find("div", string='PPDA')
    home = ppda.findNext('div', {'class': "progress-value"})
    away = home.findNext('div', {'class': "progress-value"})
    print(home.text, away.text)
    # collect plain dicts; DataFrame.append was removed in pandas 2.0
    rows.append({'HOME': float(home.text), 'AWAY': float(away.text)})

df = pd.DataFrame(rows, columns=['HOME', 'AWAY'])
df.to_csv("ppda2.csv", encoding='utf-8', index=False)
df.plot.bar()
plt.show()
Outputs: CSV file and Graph
What I'm trying to do:
I'm using Python to populate data to an existing Excel file.
What works:
My code below is successful in exporting the table to the Excel file "Futures.xlsx".
What doesn't work:
The code below extracts the table from the website and exports it to the Futures Excel file (100 rows). The subsequent code re-opens the Futures file and appends its rows to the Futures1 Excel file (200 rows). However, no matter how many times I re-run the subsequent code, I cannot get more than 200 rows - it never grows to 300 rows and beyond. Can someone tell me the issue?
My code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import os

url = "https://quotes.ino.com/exchanges/contracts.html?r=NYMEX_NG"
res = requests.get(url)
soup = BeautifulSoup(res.text, 'lxml')

Markets = []
Contracts = []
Opens = []
Highs = []
Lows = []
Lasts = []
Changes = []
Pcts = []

data_rows = soup.findAll('tr')[3:]
for td in data_rows[:100]:
    Market = td.findAll('td')[0].text
    Markets.append(Market)
    Contract = td.findAll('td')[1].text
    Contracts.append(Contract)
    Open = td.findAll('td')[2].text
    Opens.append(Open)
    High = td.findAll('td')[3].text
    Highs.append(High)
    Low = td.findAll('td')[4].text
    Lows.append(Low)
    Last = td.findAll('td')[5].text
    Lasts.append(Last)
    Change = td.findAll('td')[6].text
    Changes.append(Change)
    Pct = td.findAll('td')[7].text
    Pcts.append(Pct)
    Time = td.findAll('td')[8].text

# use the Markets list, not the single Market value left over from the last row
df = pd.DataFrame({'Contracts': Contracts, 'Markets': Markets, 'Open': Opens,
                   'High': Highs, 'Low': Lows, 'Last': Lasts, 'Pct': Pcts})
out_path = r"C:\Sid\Futures.xlsx"  # raw string; xlsxwriter writes .xlsx, not .xls
writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
df.to_excel(writer, 'Sheet2', index=False)
writer.save()
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

wb = load_workbook(r"C:\Sid\Futures.xlsx")
ws = wb['Sheet2']
for row in dataframe_to_rows(df, header=None, index=True):
    ws.append(row)
wb.save(r'C:\Sid\Futures1.xlsx')
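The count stalls at 200 because every run reads the original 100-row Futures.xlsx and appends the same 100-row DataFrame, then saves the result as Futures1.xlsx; the file that grows is never the file that gets read. A minimal sketch of one way around this, assuming the paths above: load Futures1.xlsx when it already exists, so each run builds on the previous one.

import os
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# hypothetical fix: read from and write to the same growing workbook
grow_path = r'C:\Sid\Futures1.xlsx'
base_path = r'C:\Sid\Futures.xlsx'
wb = load_workbook(grow_path if os.path.exists(grow_path) else base_path)
ws = wb['Sheet2']
for row in dataframe_to_rows(df, header=None, index=True):
    ws.append(row)
wb.save(grow_path)  # the next run appends to this file, so the sheet keeps growing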
Additional:
Also, what code do I need so that my Python script runs automatically when the website updates? Prices change every 15 minutes.
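Python itself will not see the website change; the usual approach is to re-run the scrape on the same 15-minute cadence, either with an OS scheduler (cron, Windows Task Scheduler) or a simple loop. A minimal sketch, assuming the scraping and appending code above is wrapped in a function named scrape_and_append (a hypothetical name):

import time

def scrape_and_append():
    # placeholder for the scraping and appending code above
    pass

while True:
    scrape_and_append()
    time.sleep(15 * 60)  # wait 15 minutes between runs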
I wrote this code on Python 2.7.13 for scraping a data table from a website.
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)  # note: unused, and 'data' is reassigned in the loop below

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + "," + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem,Devizanev,Egyseg,Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
I want to export this into a CSV file with these 4 columns:
"Penznem Devizanev Egyseg Penznemforintban"
The fields are separated with ",", but the last value uses a comma as its decimal separator (e.g. 283,45), so it gets split across two columns.
How can I fix it?
You cannot easily avoid that last comma directly, but what you can do is use another separator, i.e. ; (semicolon).
When you open the file in Excel or Calc, select semicolon (;) as the separator and you will get the result you expect!
import urllib2
from bs4 import BeautifulSoup
import csv
import os

out = open("proba.csv", "rb")
data = csv.reader(out)

def make_soup(url):
    thepage = urllib2.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

maindatatable = ""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
    datatable = ""
    for data in record.findAll('td'):
        datatable = datatable + ";" + data.text
    maindatatable = maindatatable + "\n" + datatable[1:]

header = "Penznem;Devizanev;Egyseg;Penznemforintban"
print maindatatable

file = open(os.path.expanduser("proba.csv"), "wb")
utf16_str1 = header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
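Alternatively (not part of the original answer, just a sketch): Python's csv module with its default QUOTE_MINIMAL setting quotes any field that contains the delimiter, so a value like 283,45 survives even with a comma separator. A Python 2 sketch, reusing the soup from make_soup above and a hypothetical output file name:

import csv

with open("proba_quoted.csv", "wb") as f:
    writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Penznem", "Devizanev", "Egyseg", "Penznemforintban"])
    for record in soup.findAll('tr'):
        cells = [cell.text.encode('utf-8') for cell in record.findAll('td')]
        if cells:  # skip rows without td cells
            writer.writerow(cells)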