Python web scraping exchange prices

I want to scrape the exchange rate information from this website and then load it into a database: https://www.mnb.hu/arfolyamok
I wrote this code, but something is wrong with it. How can I fix it, and where do I have to change it?
I am working with Python 2.7.13 on Windows 7.
The code is here:
import csv
import requests
from BeautifulSoup import BeautifulSoup

url = 'https://www.mnb.hu/arfolyamok'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)

table = soup.find('tbody', attrs={'class': 'stripe'})

list_of_rows = []
for row in table.findAll('tr')[1:]:
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace(' ', '')
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

print list_of_rows

outfile = open("./inmates.csv", "wb")
writer = csv.writer(outfile)
writer.writerow(["Pénznem", "Devizanév", "Egység", "Forintban kifejezett érték"])
writer.writerows(list_of_rows)

Add # coding=utf-8 to the top of your code. This will help solve the SyntaxError you are receiving. Also make sure your indentation is correct!
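For example, a minimal sketch of just the affected parts (the declaration must sit on the first or second line of the file so Python 2 can decode the non-ASCII literals):

# coding=utf-8
import csv

outfile = open("./inmates.csv", "wb")
writer = csv.writer(outfile)
# without the coding declaration above, these accented literals
# raise a SyntaxError in Python 2
writer.writerow(["Pénznem", "Devizanév", "Egység", "Forintban kifejezett érték"])
outfile.close()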

Related

Issues saving data into csv during web scraping

I am having some issues saving rows to a csv file after web scraping. I used the same notation and it worked well before on another site, but now the csv file is blank; it seems Python is not writing any rows.
Here is my code, thanks in advance:
import requests
from bs4 import BeautifulSoup
import csv
import lxml

html_page = requests.get('https://www.scrapethissite.com/pages/forms/?page_num=1').text
soup = BeautifulSoup(html_page, 'lxml')

# get the number of pages (it might change in the future as the data is updated)
pagenum = soup.find('ul', {'class': 'pagination'})
n = pagenum.findAll('li')[-2].find('a')['href'].split('=')[1]

# now we convert the value of the page in a range so that we can loop over it
page = range(1, int(n) + 1)
print(page)

with open('HockeyLeague.csv', 'w') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['team_name', 'year', 'wins', 'losses', 'win_perc', 'goal_for', 'goal_against'])
    for p in page:
        html_page = requests.get(f'https://www.scrapethissite.com/pages/forms/?page_num={p}&per_page=25').text
        soup = BeautifulSoup(html_page, 'lxml')
        table = soup.find('table', {'class': 'table'})
        for row in table.findAll('tr', {'class': 'team'}):
            # getting the wanted variables:
            team_name = row.find('td', {'class': 'name'}).text
            year = row.find('td', {'class': 'year'}).text
            wins = row.find('td', {'class': 'wins'}).text
            losses = row.find('td', {'class': 'losses'}).text
            goal_for = row.find('td', {'gf'}).text
            goal_against = row.find('td', {'ga'}).text
            try:
                win_perc = row.find('td', {'pct text-success'}).text
            except:
                win_perc = row.find('td', {'pct text-danger'}).text
            # write the data in the csv file we created at the beginning
            csv_writer.writerow([team_name, year, wins, losses, win_perc, goal_for, goal_against])
Since your script in general is working, these are just some things you should keep in mind:
I would recommend opening the file with newline='' on all platforms to disable universal newlines translation, and with encoding='utf-8' to be sure you are working with the "correct" one:
with open('HockeyLeague.csv', 'w', newline='', encoding='utf-8') as f:
    ...
.strip() your texts or use .get_text(strip=True) to get clean output and avoid line breaks you do not want:
team_name = row.find('td', {'class': 'name'}).text.strip()
year = row.find('td', {'class': 'year'}).text.strip()
...
In newer code avoid the old syntax findAll(); use find_all() instead. For more, take a minute to check the docs.
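For example, the row selection in your script would become (findAll() still works in bs4 as a legacy alias, but find_all() is the documented form):

rows = table.find_all('tr', {'class': 'team'})  # same query as table.findAll('tr', {'class': 'team'})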
Alternative Example
Uses a while loop that checks for the "Next" button and extracts its url, and stripped_strings to extract the texts from each row:
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.scrapethissite.com/pages/forms/'

with open('HockeyLeague.csv', 'w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['team_name', 'year', 'wins', 'losses', 'win_perc', 'goal_for', 'goal_against'])
    while True:
        html_page = requests.get(url).text
        soup = BeautifulSoup(html_page, 'html.parser')
        for row in soup.find_all('tr', {'class': 'team'}):
            # write the data in the csv file we created at the beginning
            csv_writer.writerow(list(row.stripped_strings)[:-1])
        if soup.select_one('.pagination a[aria-label="Next"]'):
            url = 'https://www.scrapethissite.com' + soup.select_one('.pagination a[aria-label="Next"]').get('href')
        else:
            break
Output
team_name,year,wins,losses,win_perc,goal_for,goal_against
Boston Bruins,1990,44,24,0.55,299,264
Buffalo Sabres,1990,31,30,0.388,292,278
Calgary Flames,1990,46,26,0.575,344,263
Chicago Blackhawks,1990,49,23,0.613,284,211
Detroit Red Wings,1990,34,38,0.425,273,298
Edmonton Oilers,1990,37,37,0.463,272,272
...

After scraping I cannot write the text to a text file

I am trying to scrape the prices from a website and it's working, but I can't write the result to a text file.
This is my python code.
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")
price = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
name = "example"
f = open(name + '.txt', "a")
f.write(price.text)
This is not working, but if I print it instead of trying to write it to a text file, it works. I have searched for a long time but don't understand it. I think it must be a string to write to a text file, but I don't know how to change the output to a string.
You're getting the error due to a Unicode character: on Windows, open() defaults to the system code page (often cp1252), which cannot encode every character on the page. Try adding the encoding='utf-8' parameter when opening the file.
Also, your code gives a bit of a messy output. Try this instead:
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")
rows = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
prices = rows.findAll("span", {"class": "price-holder-row"})
names = rows.findAll("div", {"class": "name-holder"})

price_list = []
name_list = []
for price in prices:
    price_list.append(price.text.strip("\n "))
for name in names:
    name_list.append(name.text.split()[0])

name = "example"
with open(f"{name}.txt", mode='w', encoding='utf-8') as f:
    for name, price in zip(name_list, price_list):
        f.write(f"{name}:{price}\n")

Writing scraped links to a CSV file using Python3

I have scraped a website for html links and have a result of about 500 links. When I try to write them to a csv file, I do not get the list, only the base page.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')

link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)

csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
writer.writerow([web_links])
csvfile.close()
I only get two lines in my csv file: the header 'Links' and www.census.gov. I have tried making it different by adding another for loop in the csv writer area, but I get similar results.
for link in soup.find_all('a'):
    web_links = link.get('href')
    abs_url = join(page, web_links)
    print(abs_url)
    if abs_url and abs_url not in link_set:
        writer.write(str(abs_url) + "\n")
        link_set.add(abs_url)
It seems the 'web_links' definition should be where I put all the links into the csv file, but no dice. Where am I making my mistake?
In your code, you are writing only two rows to the csv, i.e.:
writer.writerow(['Links'])
writer.writerow([web_links])
Here web_links is the last retrieved href value.
I don't see the use of the set instance. You can print and write to the csv without using a set in the following way:
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')

csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])

for link in soup.find_all('a'):
    web_links = link.get("href")
    if web_links:
        print(web_links)
        writer.writerow([web_links])

csvfile.close()
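The same logic also works with a context manager, which closes the file even if something fails mid-loop (a stylistic variant of the code above, not part of the fix):

with open('code_python.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    for link in soup.find_all('a'):
        web_links = link.get("href")
        if web_links:
            writer.writerow([web_links])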
You have never added the scraped links to your set():
import requests
from bs4 import BeautifulSoup
import csv

page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')

link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)
    link_set.add(web_links)

csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
for link in link_set:
    writer.writerow([link])
csvfile.close()

csv.writer formatting from beautifulsoup table/row

So I have working code that pulls data from 30 websites on a domain.
import csv
import requests
from bs4 import BeautifulSoup

with open("c:\source\list.csv") as f:
    for row in csv.reader(f):
        for url in row:
            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'lxml')
            tables = soup.find('table', attrs={"class": "hpui-standardHrGrid-table"})
            for rows in tables.find_all('tr', {'releasetype': 'Current_Releases'})[0::1]:
                item = []
                for val in rows.find_all('td'):
                    item.append(val.text.strip())
                with open('c:\source\output_file.csv', 'w', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow({url})
                    writer.writerows(item)
When I open the CSV file, I see each character taken from the 'item' variable stored in its own cell. I can't seem to find out what is doing this and how to fix it.
Any thoughts?
I fixed this by changing
writer.writerows(item)
to
writer.writerow(item)
writerows() expects an iterable of rows; given a list of strings, it treats each string as a row, and since a string is itself a sequence of characters, every character ends up in its own cell.
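A quick standalone way to see the difference (writing to stdout instead of a file):

import csv
import sys

writer = csv.writer(sys.stdout)
writer.writerows(["abc", "def"])  # two rows: a,b,c and d,e,f -- each string is iterated
writer.writerow(["abc", "def"])   # one row: abc,def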

python web scraping and writing data into csv

I'm trying to save all the data (i.e. all pages) in a single csv file, but this code only saves the final page's data. E.g. here url[] contains 2 urls, and the final csv only contains the 2nd url's data.
I'm clearly doing something wrong in the loop, but I don't know what.
Also, this page contains 100 data points, but this code only writes the first 44 rows.
Please help with this issue.
from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo", "http://sfbay.craigslist.org/search/sfc/npo?s=100"]
for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content)
    g_data = soup.find_all("a", {"class": "hdrlnk"})
    gen_list = []
    for row in g_data:
        try:
            name = row.text
        except:
            name = ''
        try:
            link = "http://sfbay.craigslist.org" + row.get("href")
        except:
            link = ''
        gen = [name, link]
        gen_list.append(gen)

with open('filename2.csv', 'wb') as file:
    writer = csv.writer(file)
    for row in gen_list:
        writer.writerow(row)
The gen_list is being initialized again inside your loop that runs over the urls:
gen_list = []
Move this line outside the for loop:
...
url = ["http://sfbay.craigslist.org/search/sfc/npo", "http://sfbay.craigslist.org/search/sfc/npo?s=100"]
gen_list = []
for ur in url:
    ...
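Put together, a minimal corrected sketch (same logic as your post, with gen_list created once; the explicit "html.parser" argument is my addition to silence the parser warning):

from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo",
       "http://sfbay.craigslist.org/search/sfc/npo?s=100"]

gen_list = []  # created once, so rows from every url accumulate
for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content, "html.parser")
    for row in soup.find_all("a", {"class": "hdrlnk"}):
        name = row.text
        link = "http://sfbay.craigslist.org" + row.get("href")
        gen_list.append([name, link])

with open('filename2.csv', 'wb') as file:  # 'wb' matches the Python 2 code in the question
    writer = csv.writer(file)
    for row in gen_list:
        writer.writerow(row)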
I found your post later; you may want to try this method:
import requests
from bs4 import BeautifulSoup
import csv

final_data = []
url = "https://sfbay.craigslist.org/search/sss"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")

get_details = soup.find_all(class_="result-row")
for details in get_details:
    getclass = details.find_all(class_="hdrlnk")
    for link in getclass:
        link1 = link.get("href")
        sublist = []
        sublist.append(link1)
        final_data.append(sublist)
print(final_data)

filename = "sfbay.csv"
with open("./" + filename, "w") as csvfile:
    csvfile = csv.writer(csvfile, delimiter=",")
    csvfile.writerow("")
    for i in range(0, len(final_data)):
        csvfile.writerow(final_data[i])
