The code below yields a value for the "resultStats" ID, which I would like to save in a CSV file. Is there a smart way to have the "desired_google_queries" (i.e. the search terms) in column A and the "resultStats" values in column B of the CSV?
I saw that there are a number of threads on this topic, but none of the solutions I read through worked for this specific situation.
from bs4 import BeautifulSoup
import urllib.request
import csv

desired_google_queries = ['Elon Musk', 'Tesla', 'Microsoft']

for query in desired_google_queries:
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    print(resultStats)
I took the liberty of rewriting this to use the Requests library instead of urllib, but this shows how to do the CSV writing, which is what I think you were more interested in:
from bs4 import BeautifulSoup
import requests
import csv

desired_google_queries = ['Elon Musk', 'Tesla', 'Microsoft']

result_stats = dict()
for query in desired_google_queries:
    url = 'http://google.com/search?q=' + query
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    result_stats[query] = soup.find(id="resultStats").string

with open('searchstats.csv', 'w', newline='') as fout:
    cw = csv.writer(fout)
    for q in desired_google_queries:
        cw.writerow([q, result_stats[q]])
Instead of writing it line by line, you can write it all in one go by storing the results in a pandas DataFrame first. See the code below:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

data_dict = {'desired_google_queries': [],
             'resultStats': []}

desired_google_queries = ['Elon Musk', 'Tesla', 'Microsoft']

for query in desired_google_queries:
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string

    data_dict['desired_google_queries'].append(query)
    data_dict['resultStats'].append(resultStats)

df = pd.DataFrame(data=data_dict)
df.to_csv(path_or_buf='path/where/you/want/to/save/thisfile.csv', index=None)
Unfortunately, the original answer has been deleted - please find the code below for everyone else interested in this situation. Thanks to the user who posted the solution in the first place:
with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])

    for query in desired_google_queries:
        ...
        spamwriter.writerow([query, resultStats])
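For completeness, here is a sketch of how that writer fits around the original scraping loop. The fetch-and-parse part is taken verbatim from the question; treat this as illustrative rather than the exact text of the deleted answer:

from bs4 import BeautifulSoup
import urllib.request
import csv

desired_google_queries = ['Elon Musk', 'Tesla', 'Microsoft']

with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])

    for query in desired_google_queries:
        # fetch and parse the result page, exactly as in the question's code
        url = 'http://google.com/search?q=' + query
        req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
        response = urllib.request.urlopen(req)
        soup = BeautifulSoup(response.read(), 'html.parser')
        resultStats = soup.find(id="resultStats").string

        # one row per query: search term in column A, result stats in column B
        spamwriter.writerow([query, resultStats])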
Related
I am trying to scrape a web page using Python and BeautifulSoup. When I write:
table = soup.find('table')
it returns None.
and when I try to get the row content, it always returns an empty list.
I also tried Selenium, with the same result: an empty list.
import requests
from bs4 import BeautifulSoup
import csv

url = "https://www.iea.org/data-and-statistics/data-tables?country=CANADA&energy=Balances&year=2010"
response = requests.get(url)
print(response.status_code)   # prints 200

soup = BeautifulSoup(response.text, "html.parser")
tr = soup.findAll('tr', attrs={'class': 'm-data-table__row '})
print(tr)        # prints []
print(len(tr))   # prints 0

csvFile = open("C:/Users/User/Desktop/test27.csv", 'wt', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
try:
    for cell in tr:
        td = cell.find_all('td')
        row = [i.text.replace('\n', '') for i in td]
        writer.writerow(row)
finally:
    csvFile.close()
Any help?
When you analyse the website, you can see that the data is loaded via an AJAX call. The following script makes that call and saves the required JSON to a file:
import requests, json

res = requests.get("https://api.iea.org/stats/?year=2010&countries=CANADA&series=BALANCES")
data = res.json()

with open("data.json", "w") as f:
    json.dump(data, f)
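If you still need the result as a CSV rather than JSON, a minimal sketch along these lines should work, assuming the response is a list of flat record dicts (check the actual structure of the API response first):

import csv
import json

with open("data.json") as f:
    data = json.load(f)

# Assumes the payload is a list of flat dicts; adjust if the API nests the rows differently.
if isinstance(data, list) and data:
    with open("data.csv", "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)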
This script needs to run all the way through RI_page_urls.csv, then run through all the resulting URLs from RI_License_urls.csv and grab the business info.
It's pulling all the URLs from RI_page_urls.csv, but then only running and printing the first of the 100 URLs from RI_License_urls.csv. I need help figuring out how to make it wait for the first part to complete before running the second part.
I appreciate any and all help.
Here's a URL for RI_page_urls.csv to start with:
http://www.crb.state.ri.us/verify_CRB.php
and the code:
from bs4 import BeautifulSoup as soup
import requests as r
import pandas as pd
import re
import csv

#pulls lic# url
with open('RI_page_urls.csv') as f_input:
    csv_input = csv.reader(f_input)
    for url in csv_input:
        data = r.get(url[0])
        page_data = soup(data.text, 'html.parser')
        links = [r'www.crb.state.ri.us/' + link['href']
                 for link in page_data.table.tr.find_all('a') if re.search('licensedetail.php', str(link))]
        df = pd.DataFrame(links)
        df.to_csv('RI_License_urls.csv', header=False, index=False, mode='a')

#Code Above works!
#need to pull table info from license url
#this pulls the first record, but doesn't loop through the requests
with open('RI_License_urls.csv') as f_input_2:
    csv_input_2 = csv.reader(f_input_2)
    for url in csv_input_2:
        data = r.get(url[0])
        page_data = soup(data.text, 'html.parser')
        company_info = (' '.join(info.get_text(", ", strip=True).split()) for info in page_data.find_all('h9'))
        df = pd.DataFrame(info, columns=['company_info'])
        df.to_csv('RI_company_info.csv', index=False)
Well, the question is a bit unclear, and there are also a couple of things wrong with the code.
data = r.get(url[0])
should be the following, because the URLs start with http or https, not www:
data = r.get("http://"+url[0])
In the code below, info is not defined, so I just assumed it should be company_info:
company_info = (' '.join(info.get_text(", ", strip=True).split()) for info in page_data.find_all('h9'))
df = pd.DataFrame(info, columns=['company_info'])
Hence, the full code is:
from bs4 import BeautifulSoup as soup
import requests as r
import pandas as pd
import re
import csv

#pulls lic# url
with open('RI_page_urls.csv') as f_input:
    csv_input = csv.reader(f_input)
    for url in csv_input:
        data = r.get(url[0])
        page_data = soup(data.text, 'html.parser')
        links = [r'www.crb.state.ri.us/' + link['href']
                 for link in page_data.table.tr.find_all('a') if re.search('licensedetail.php', str(link))]
        df = pd.DataFrame(links)
        df.to_csv('RI_License_urls.csv', header=False, index=False, mode='a')

#Code Above works!
#need to pull table info from license url
#this pulls the first record, but doesn't loop through the requests
with open('RI_License_urls.csv') as f_input_2:
    csv_input_2 = csv.reader(f_input_2)
    with open('RI_company_info.csv', 'a') as companyinfofiledescriptor:
        for url in csv_input_2:
            data = r.get("http://" + url[0])
            page_data = soup(data.text, 'html.parser')
            company_info = (' '.join(info.get_text(", ", strip=True).split()) for info in page_data.find_all('h9'))
            df = pd.DataFrame(company_info, columns=['company_info'])
            df.to_csv(companyinfofiledescriptor, index=False)
            print(df)
I've written some code using Python to scrape some titles and prices from a webpage and write the results to a CSV file. The script runs fine, but because I'm appending data to the CSV file, the headers are written on every pass, so if it runs 4 loops the headers appear 4 times. How do I fix it so that the headers are written only once? Thanks.
This is the script:
import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND', 'http://www.bloomberg.com/quote/CCMP:IND']

for link in diction_page:
    res = requests.get(link).text
    soup = BeautifulSoup(res, 'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title, price)

    with open('item.csv', 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Title", "Price"])
        writer.writerow([title, price])
As an option, you can try this:
import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND', 'http://www.bloomberg.com/quote/CCMP:IND']

for i, link in enumerate(diction_page):
    res = requests.get(link).text
    soup = BeautifulSoup(res, 'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title, price)

    with open('item.csv', 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        if i == 0:
            writer.writerow(["Title", "Price"])
        writer.writerow([title, price])
Don't write the headers in the for loop:
import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND', 'http://www.bloomberg.com/quote/CCMP:IND']

outfile = open('item.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Title", "Price"])

for link in diction_page:
    res = requests.get(link).text
    soup = BeautifulSoup(res, 'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title, price)
    writer.writerow([title, price])

outfile.close()
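If you prefer not to manage the close yourself, the same logic also fits in a with block; this is a functionally equivalent sketch:

import csv
import requests
from bs4 import BeautifulSoup

diction_page = ['http://www.bloomberg.com/quote/SPX:IND', 'http://www.bloomberg.com/quote/CCMP:IND']

# The file is opened once, the header is written once, and the file is closed automatically.
with open('item.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Title", "Price"])
    for link in diction_page:
        soup = BeautifulSoup(requests.get(link).text, 'lxml')
        title = soup.select_one('.name').text.strip()
        price = soup.select_one('.price').text
        writer.writerow([title, price])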
I'm trying to save all the data (i.e. all pages) in a single CSV file, but this code only saves the final page's data. E.g. here url[] contains 2 URLs, and the final CSV only contains the data from the 2nd URL.
I'm clearly doing something wrong in the loop, but I don't know what.
Also, this page contains 100 data points, but this code only writes the first 44 rows.
Please help with this issue.
from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo", "http://sfbay.craigslist.org/search/sfc/npo?s=100"]

for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content)
    g_data = soup.find_all("a", {"class": "hdrlnk"})
    gen_list = []
    for row in g_data:
        try:
            name = row.text
        except:
            name = ''
        try:
            link = "http://sfbay.craigslist.org" + row.get("href")
        except:
            link = ''
        gen = [name, link]
        gen_list.append(gen)

with open('filename2.csv', 'wb') as file:
    writer = csv.writer(file)
    for row in gen_list:
        writer.writerow(row)
The gen_list is being initialized again inside your loop that runs over the URLs:
gen_list=[]
Move this line outside the for loop.
...
url = ["http://sfbay.craigslist.org/search/sfc/npo","http://sfbay.craigslist.org/search/sfc/npo?s=100"]
gen_list=[]
for ur in url:
...
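For reference, here is a minimal sketch of the corrected script with gen_list moved out of the loop. It is written for Python 3, so the output file is opened with 'w' and newline='' rather than 'wb':

from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo", "http://sfbay.craigslist.org/search/sfc/npo?s=100"]

gen_list = []  # accumulate rows from every page here
for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content, "html.parser")
    g_data = soup.find_all("a", {"class": "hdrlnk"})
    for row in g_data:
        name = row.text
        link = "http://sfbay.craigslist.org" + row.get("href")
        gen_list.append([name, link])

# write once, after all pages have been scraped
with open('filename2.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(gen_list)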
I found your post later; you may want to try this method:
import requests
from bs4 import BeautifulSoup
import csv

final_data = []
url = "https://sfbay.craigslist.org/search/sss"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")

get_details = soup.find_all(class_="result-row")
for details in get_details:
    getclass = details.find_all(class_="hdrlnk")
    for link in getclass:
        link1 = link.get("href")
        sublist = []
        sublist.append(link1)
        final_data.append(sublist)
print(final_data)

filename = "sfbay.csv"
with open("./" + filename, "w", newline="") as csvfile:
    csvfile = csv.writer(csvfile, delimiter=",")
    csvfile.writerow("")
    for i in range(0, len(final_data)):
        csvfile.writerow(final_data[i])
Okay, I have built a program to scrape Yahoo Finance. I want the historical prices of a certain stock, and I then want them written to a spreadsheet. It does everything the way it's supposed to, but it gives me ALL of the data on the whole page! I need just the data in the table. Thanks.
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import requests

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

playerdatasaved = ""
soup = make_soup("https://finance.yahoo.com/q/hp?s=USO+Historical+Prices")
for record in soup.findAll('tr'):
    playerdata = ""
    for data in record.findAll('td'):
        playerdata = playerdata + "," + data.text

    if len(playerdata) != 0:
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

header = "Open,Close,High,Low"
file = open(os.path.expanduser("Uso.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors='ignore'))
file.write(bytes(playerdatasaved, encoding="ascii", errors='ignore'))
print(playerdatasaved)
To get the table of data:
soup = make_soup("https://finance.yahoo.com/q/hp?s=USO+Historical+Prices")
table = [[cell.text for cell in row.findAll('td')] for row in soup.findAll('tr')]
To write out the table of data to a file:
import csv

with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(table)