looping Row and scraping data taking input from excel file

looping Row and scraping data taking input from excel file - python

i want to scrape web data using input values from excel and scraping web for each row_value taken and save the output to same excel file.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
ciks = pd.read_csv("ciks.csv")
ciks.head()
output
CIK
0 1557822
1 1598429
2 1544670
3 1574448
4 1592290
then
for x in ciks:
url="https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + x +"&owner=exclude&action=getcompany"
r = request.urlopen(url)
bytecode = r.read()
htmlstr = bytecode.decode()
soup = BeautifulSoup(bytecode)
t = soup.find('span',{'class':'companyName'})
print(t.text)
i got an erorr :
----> 9 print (t.text)
AttributeError: 'NoneType' object has no attribute 'text'
here, i want to scrape web data taking each row value as input from the CSV file.

It would be easier to convert the column values as list and then use it in the for loop - see solution below,
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
#ciks = pd.read_csv("ciks.csv")
df = pd.read_csv("ciks.csv")
mylist = df['CIK'].tolist()# CIK is the column name
company =[]
for item in mylist:
print(item)
url="https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + str(item) +"&owner=exclude&action=getcompany"
r = request.urlopen(url)
bytecode = r.read()
htmlstr = bytecode.decode()
soup = BeautifulSoup(bytecode,features="lxml")
t = soup.find('span',{'class':'companyName'})
company.append(t.text)
print(t.text)
df.assign(company= company)
print(df)
df.to_csv("ciks.csv")

Related

Parsing JSON output file

Hi a learner in python I have written code the extracts a son file from a sports website.
The code is
from bs4 import BeautifulSoup
import requests
import json
url = "https://s3-ap-southeast-2.amazonaws.com/racevic.static/2022-08-01/sportsbet-pakenham-synthetic/sectionaltimes/race-2.json?"
payload={}
headers = {}
response = requests.request("GET", url, headers=headers, data=payload)
print(response.text)
and the output looks like this (small portion)
sectionaltimes_callback({"Horses":[{"Comment":"Slow Out 1 Lengths , got back 2nd last off tardy start 8 Lengths 800m, still mile off them getting widest from the corner, charged home last 200m for eye catching second spot # powered home widest","FinalPosition":2,"FinalPositionAbbreviation":"2nd","FullName":"Ameerati","SaddleNumber":12,"HorseUrl":"/horses/ameerati","SilkUrl":"//cdn.silks.racing.com/bb/114031.png","Trainer":"Robbie Griffiths & Mathew de Kock","TrainerUrl":"/trainers/robbie-griffiths","Jockey":"P.M.Moloney","JockeyUrl":"/jockeys/patrick-moloney","SectionalTimes":[{"Distance":"1200m","Position":11,"Time":"1:11.43","AvgSpeed":0.0},{"Distance":"1000m","Position":11,"Time":"59.29","AvgSpeed":0.0},{"Distance":"800m","Position":11,"Time":"46.95","AvgSpeed":0.0},{"Distance":"600m","Position":11,"Time":"34.77","AvgSpeed":0.0},{"Distance":"400m","Position":11,"Time":"22.71","AvgSpeed":0.0},{"Distance":"200m","Position":4,"Time":"11.45","AvgSpeed":0.0},{"Distance":"Finish","Position":2,"Time":"","AvgSpeed":0.0}],"SplitTimes":[{"Distance":"1200m-1000m","Position":11,"Time":"12.14","AvgSpeed":0.0},{"Distance":"1000m-800m","Position":11,"Time":"12.34","AvgSpeed":0.0},{"Distance":"800m-600m","Position":11,"Time":"12.18","AvgSpeed":0.0},{"Distance":"600m-400m","Position":11,"Time":"12.06","AvgSpeed":0.0},{"Distance":"400m-200m","Position":11,"Time":"11.26","AvgSpeed":0.0},{"Distance":"200m-Finish","Position":4,"Time":"11.45","AvgSpeed":0.0}],"StartPosition":0,"BarrierNumber":12,"RaceTime":"","TimeVarToWinner":0.0,"BeatenMargin":0.0,"DistanceRun":0,"DistanceVarToWinner":"","SixHundredMetresTime":"34.77","TwoHundredMetresTime":"11.45","Early":0.0,"Mid":0.0,"Late":0.0,"OverallPeakSpeed":0.0,"PeakSpeedLocation":null,"OverallAvgSpeed":0.0,"DistanceFromRail":0.0},
The help I would appreciate now is what do I do to put this in a format that I can open in excel

import pandas as pd
import requests
import json
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.utils.dataframe import dataframe_to_rows
def race_data_to_xslxs(url, fname):
# get data
data = json.loads(requests.get(url).text[24:-1])
# create dataframes
dfs = {}
singulars = pd.DataFrame()
for k, v in data.items():
if isinstance(v, list):
dfs[k] = pd.DataFrame(v)
else:
singulars[k] = [v]
dfs = {'summary': singulars, **dfs}
# create workbook
wb = Workbook()
for k, df in dfs.items():
# create sheet
wsx = wb.create_sheet(title=k)
rows = dataframe_to_rows(df)
for r_idx, row in enumerate(rows, 1):
for c_idx, value in enumerate(row, 1):
wsx.cell(row=r_idx, column=c_idx, value=str(value))
del wb['Sheet']
# write excel file
wb.save(filename=fname)
url = "https://s3-ap-southeast-2.amazonaws.com/racevic.static/2022-08-01/sportsbet-pakenham-synthetic/sectionaltimes/race-2.json?"
path = 'fname.xlsx'
race_data_to_xslxs(url=url, fname=path)

The API is returning JSONP, not JSON. This is JSON wrapped in a call to a callback function, which can be used by browsers without violating the same-origin rule. You need to remove that function call before parsing it as JSON.
import re
import json
response = requests.request("GET", url, headers=headers, data=payload)
json_string = re.sub(r'^sectionaltimes_callback\((.*)\)$', r'\1', response)
data = json.loads(json_string)

You can try this -
import requests
import json
url = "https://s3-ap-southeast-2.amazonaws.com/racevic.static/2022-08-01/sportsbet-pakenham-synthetic/sectionaltimes/race-2.json?"
response = requests.get(url)
json.loads(response.text.split("(", 1)[1].strip(")"))

How can I convert list of crawled data to excel column?

import openpyxl
xl_file = openpyxl.Workbook()
xl_sheet =xl_file.active
from urllib.request import urlopen
from bs4 import BeautifulSoup
stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code='+ stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
maxPage=source.find_all("table",align="center")
mp = maxPage[0].find_all("td",class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])
for page in range(1, 10):
print (str(page) )
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem +'&page='+ str(page)
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
srlists=source.find_all("tr")
isCheckNone = None
if((page % 1) == 0):
time.sleep(0)
for i in range(1,len(srlists)-1):
if(srlists[i].span != isCheckNone):
srlists[i].td.text
data1 = srlists[i].find_all("td",align="center")
data2 = srlists[i].find_all("td",class_="num")
print(srlists[i].find_all("td",align="center")[0].text, srlists[i].find_all("td",class_="num")[0].text )
for item in data1:
xl_sheet.append([item.get_text()])
This is what I've done for crawling stock data from the site.
I've successfully crawled the data of stock.
However, I couldn't save the data into excel file.
I've tried it, but it only showed the date data without the price data.
How could I convert results to excel file?

There were 2 things you misssed,
1) Mistake in importing packages
2) Did not append data2 in excel which contains prices
Here is the final code which will give your desired output. Just put your folder location for saving excel file.
import time
from openpyxl import Workbook #
xl_file = Workbook()
xl_sheet =xl_file.active
from urllib.request import urlopen
from bs4 import BeautifulSoup
i = 0
stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code='+ stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
maxPage=source.find_all("table",align="center")
mp = maxPage[0].find_all("td",class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])
for page in range(1, 10):
print (str(page) )
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem +'&page='+ str(page)
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
srlists=source.find_all("tr")
isCheckNone = None
if((page % 1) == 0):
time.sleep(0)
for i in range(1,len(srlists)-1):
if(srlists[i].span != isCheckNone):
srlists[i].td.text
data1 = srlists[i].find_all("td",align="center")
data2 = srlists[i].find_all("td",class_="num")
#print(srlists[i].find_all("td",align="center")[0].text, srlists[i].find_all("td",class_="num")[0].text )
for item1,item2 in zip(data1,data2):
xl_sheet.append([item.get_text(),item2.get_text()])
print(xl_sheet)
xl_file.save(r'C:\Users\Asus\Desktop\vi.xlsx')
Suggestion : You can use Yahoofinance package for python to download stock data easily.
you can follow this link >> https://pypi.org/project/yfinance/

Can't write in a CSV file python

I am trying to write data into a csv after scraping using pandas dataframe, but the csv is empty even after program execution. The headers are written first but they are also overwritten when dataframe comes into action.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re as resju
import csv
import pandas as pd
re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')
links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]
headers = ['Name', 'LINK']
# this is output file, u can change the path as you desire, default is the working directory
file = open('data123.csv', 'w', encoding="utf-8")
writer = csv.writer(file)
writer.writerow(headers)
for i in links_with_text:
new_re = requests.get(i)
new_soup = BeautifulSoup(new_re.content, 'html.parser')
m = new_soup.select_one('h1 div')
Name = m.text
print(Name)
n = new_soup.select_one('iframe')
ni = n['src']
iframe = requests.get(ni)
i_soup = BeautifulSoup(iframe.content, 'html.parser')
d_script = i_soup.select_one('body > script')
d_link = d_script.text
mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
final_link = mp4.findall(d_link)[0]
print(final_link)
df = pd.DataFrame(zip(Name, final_link))
df.to_csv(file, header=None, index=False)
file.close()
df.head() returns:
0 1
0 ل h
1 ي t
2 ل t
3 ى p
4 s
0 1
0 ل h
1 ي t
2 ل t
3 ى p
4 s
Any suggestion ?

It seems you are using a mix of libraries to write to a csv, pandas handles all this nicely so there is no need to use the inbuilt csv module from python -
I've modified your code below - this should return your dataframe as a whole df and write it out as a csv.
also using Headers=None you were setting the columns to nothing, so they would be referenced by an index number.
from bs4 import BeautifulSoup
import requests
import re as resju
#import csv
import pandas as pd
re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')
links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]
names_ = [] # global list to hold all iterable variables from your loops
final_links_ = []
for i in links_with_text:
new_re = requests.get(i)
new_soup = BeautifulSoup(new_re.content, 'html.parser')
m = new_soup.select_one('h1 div')
Name = m.text
names_.append(name) # append to global list.
print(Name)
n = new_soup.select_one('iframe')
ni = n['src']
iframe = requests.get(ni)
i_soup = BeautifulSoup(iframe.content, 'html.parser')
d_script = i_soup.select_one('body > script')
d_link = d_script.text
mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
final_link = mp4.findall(d_link)[0]
print(final_link)
final_links_.append(final_link) # append to global list.
df = pd.DataFrame(zip(names_, final_links_)) # use global lists.
df.columns = ['Name', 'LINK']
df.to_csv(file, index=False)

Scraping specific element from Understat.com

I want to retrieve specific stat (PPDA) from multiple matchs on this site:
https//understat.com/match/xxxx
I have created the follow to parse the HTML and loop through each match using Python but i am struggling how to extract the specific stat and load it to a csv and to a graph. I am a beginner and any help would be appreciated!
Code:
import pandas as pd
import re
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime
import csv
for i in range(9577,9807):
ppda_url = 'https://understat.com/match/' + str(i)
ppda_data = requests.get(ppda_url)
ppda_html = ppda_data.content
xml
soup = BeautifulSoup(ppda_html, 'lxml')
options=webdriver.ChromeOptions()
driver = webdriver.Chrome(chrome_options=options)
driver.get(ppda_url)
soup = BeautifulSoup(driver.page_source, 'lxml')

To extract the data with BeautifulSoup and write it to a CSV file, first find the div element with the text PPDA. Then find the next div element with the class progress-value then the nextdiv element with the class progress-value and get the data from these two last divs. Write it to a csv file like so.
import requests
from bs4 import BeautifulSoup
import csv
with open('ppda.csv', 'w', newline='') as csvfile:
for i in range(9577,9807):
ppda_url = 'https://understat.com/match/' + str(i)
ppda_data = requests.get(ppda_url)
ppda_html = ppda_data.content
soup = BeautifulSoup(ppda_html, 'lxml')
ppda = soup.find("div", string='PPDA')
home = ppda.findNext('div', {'class':"progress-value"})
print (home.text, home.findNext('div', {'class':"progress-value"}).text)
writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
writer.writerow([home.text, home.findNext('div', {'class':"progress-value"}).text])
To graph it look at matplotlib for a start.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(columns=['HOME', 'AWAY'])
for i in range(9577,9807):
ppda_url = 'https://understat.com/match/' + str(i)
ppda_data = requests.get(ppda_url)
ppda_html = ppda_data.content
soup = BeautifulSoup(ppda_html, 'lxml')
ppda = soup.find("div", string='PPDA')
home = ppda.findNext('div', {'class':"progress-value"})
print (home.text, home.findNext('div', {'class':"progress-value"}).text)
df = df.append({'HOME': float(home.text), 'AWAY' : float(home.findNext('div', {'class':"progress-value"}).text)}, ignore_index=True)
#print (df)
df.to_csv("ppda2.csv", encoding='utf-8', index=False)
df.plot.bar()
plt.show()
Outputs: CSV file and Graph

Python - CSV file blank after using CSV writer

I'm very new at this and I'm having trouble writing a CSV file from a webscraping. Can anyone help? Thank you!
import sys
import urllib2
import csv
import requests
from bs4 import BeautifulSoup
#web scraping wind data
r_wind = requests.get('http://w1.weather.gov/data/obhistory/KCQX.html')
html_wind = r_wind.text
soup = BeautifulSoup(html_wind, "html.parser")
table = soup.find('table')
rows_wind = table.findAll('tr')
rows_wind = rows_wind[1:]
#writing to a csv file
csvfile_wind = open("wind.csv","wb")
output_wind = csv.writer(csvfile_wind, delimiter=',',quotechar='"',quoting=csv.QUOTE_MINIMAL)
for row in rows_wind:
Date = cells[0].text.strip()
Time = cells[1].text.strip()
Wind = cells[2].text.strip()
output.writerow([Date,Time,Wind])
csvfile_wind.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

looping Row and scraping data taking input from excel file - python

Related

Parsing JSON output file

How can I convert list of crawled data to excel column?

Can't write in a CSV file python

Scraping specific element from Understat.com

Python - CSV file blank after using CSV writer

Categories

Resources