I'm trying to create a table scraper; I've written this code: http://pastebin.com/t1wSPvbb
But I have a problem with saving the data to a text file: after 10 pages the scraper saves the same data.
This is the code with the problem:
for num in range(1,500):
    print num
    try:
        resp = ''
        resp = opener.open("http://login.site.com/view.asp?view&PAGE="+str(num))
        soup = BeautifulSoup(resp.read())
        for tr in soup.find_all('tr')[3:]:
            tds = tr.find_all('td')
            outfile.write(tds[2].text.encode('utf-8','replace') + ',' + tds[0].text.encode('utf-8','replace') + ',' + tds[1].text.encode('utf-8','replace') + '\n')
        pass
    except:
        pass
After 10 pages, the scraper saves the same data for the next few pages.
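One way to see exactly where the repetition starts is a small debugging sketch like the one below. It assumes the same opener and table layout as the pastebin code (the duplicate check itself is hypothetical, not part of the original script):

from bs4 import BeautifulSoup

prev_rows = None
for num in range(1, 500):
    resp = opener.open("http://login.site.com/view.asp?view&PAGE=" + str(num))
    soup = BeautifulSoup(resp.read())
    # collect the visible text of each data row on this page
    rows = [tr.get_text(" ", strip=True) for tr in soup.find_all('tr')[3:]]
    if rows == prev_rows:
        # the site returned the same table as the previous page
        print "page %d returned the same rows as page %d" % (num, num - 1)
    prev_rows = rows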
I'm trying to export some data from a website, and I first tried it on a single page. I have to import text delimited by these titles:
['Drug name', 'General Information', 'Clinical Results', 'Side Effects', 'Mechanism of Action', 'Literature References', 'Additional Information', 'Approval Date', 'Date Created', 'Company Name']
The URL is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus
The code currently works; it gives me all the data. But when I insert it into the CSV, the information is not delimited as I wish.
Since it is a single page, the Excel file should have ONE row... but it doesn't.
The code:
from bs4 import BeautifulSoup
import requests
import csv

csv_file = open('Drugs.csv', 'w')
csv_writer = csv.writer(csv_file, delimiter='+')
csv_writer.writerow(['Drug name', 'General Information', 'Clinical Results', 'Side Effects', 'Mechanism of Action', 'Literature References', 'Additional Information', 'Approval Date', 'Date Created', 'Company Name'])

link = requests.get('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus')
aux = []
soup = BeautifulSoup(link.content, 'lxml')

drugName = soup.find('div', class_='company-navigation').find('h1').text
gralInfo = soup.find('div', class_='body directory-listing-profile__description')

y = 0
for h2 in gralInfo.find_all('h2'):
    print(y)
    text = ''
    for sibling in h2.find_next_siblings():
        if (sibling.name == 'h2'):
            break
        else:
            text = text + sibling.get_text(separator='\n') + '\n'
            print(text)
    aux.append(text)
    print()
    print()
    y = y + 1

auxi = []
for info in soup.find_all('div', class_='contact directory-listing-profile__master-detail'):
    print(info.text)
    auxi.append(info.text)

csv_writer.writerow([drugName, aux[0], aux[1], aux[2], aux[3], aux[4], aux[5], auxi[0], auxi[1], auxi[2]])
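If the extra rows in Excel come from the newline characters that get_text(separator='\n') embeds in each field (an assumption, not something confirmed from the output), a small helper that collapses the whitespace before appending may be enough, for example:

def collapse_whitespace(value):
    # replace embedded newlines and repeated spaces with single spaces
    return ' '.join(value.split())

# e.g. instead of aux.append(text):
# aux.append(collapse_whitespace(text))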
I would like the script to scrape all items from each page and append them to a CSV file, but there are 2 problems:
1) When I run the script it only scrapes a single page (the last page, 64); it doesn't crawl from page 1 to 64.
2) When the script writes data to the CSV file it doesn't append new lines; it rewrites the whole CSV file.
import csv
# YouTube Video: https://www.youtube.com/watch?v=zjo9yFHoUl8
from selenium import webdriver

MAX_PAGE_NUM = 67
MAX_PAGE_DIG = 1

driver = webdriver.Chrome('/Users/reezalaq/PycharmProjects/untitled2/venv/driver/chromedriver')

with open('result.csv', 'w') as f:
    f.write("Product Name, Sale Price, Discount, Old Price \n")

for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    url = "https://www.blibli.com/jual/batik-pria?s=batik+pria&c=BA-1000013&i=" + page_num

driver.get(url)

buyers = driver.find_elements_by_xpath("//div[@class='product-title']")
prices = driver.find_elements_by_xpath("//span[@class='new-price-text']")
discount = driver.find_elements_by_xpath("//div[@class='discount']")
oldprice = driver.find_elements_by_xpath("//span[@class='old-price-text']")

num_page_items = len(buyers)
with open('result.csv', 'a') as f:
    for c in range(num_page_items):
        f.write(buyers[c].text + ' , ' + prices[c].text + ' , ' + discount[c].text + ' , ' + oldprice[c].text + '\n')

driver.close()
The main issue you had is an indentation problem: because of it, your script was effectively run against only the last object found on the page.
Another issue I saw is that you were collecting all the titles together, all the old prices together, and so on.
Because of that it would be difficult to tell which price belongs to which item, for example when an item has missing data.
To solve this I've put every item of a single webpage into the variable "products".
About the "append" vs "write" option for the CSV: in my implementation the first thing I check is whether the result.csv file exists.
Then we have two cases:
result.csv doesn't exist: I create it and put the header in.
result.csv already exists: the header is already in place and I can simply append new rows while looping.
To get the data out easily I've used BeautifulSoup (install it easily with pip).
There are still several challenges ahead because the data on this webpage is not consistent, but the following example should be enough to get you going.
Please keep in mind that the "break" in the code stops the scraping at the 1st page.
import csv
# YouTube Video: https://www.youtube.com/watch?v=zjo9yFHoUl8
from selenium import webdriver
from bs4 import BeautifulSoup
import os.path

MAX_PAGE_NUM = 67
MAX_PAGE_DIG = 1

driver = webdriver.Chrome('/Users/reezalaq/PycharmProjects/untitled2/venv/driver/chromedriver')
#driver = webdriver.Chrome()

def write_csv_header():
    with open('result.csv', 'w') as f:
        f.write("Product Name, Sale Price, Discount, Old Price \n")

def write_csv_row(product_title, product_new_price, product_discount, product_old_price, product_link):
    with open('result.csv', 'a') as f:
        f.write(product_title + ' , ' + product_new_price + ' , ' + product_discount + ' , ' + product_old_price + ' , ' + product_link + '\n')

# write the header only if result.csv does not exist yet
if not os.path.isfile('result.csv'):
    write_csv_header()

for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    url = "https://www.blibli.com/jual/batik-pria?s=batik+pria&c=BA-1000013&i=" + page_num
    driver.get(url)
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')
    products = soup.findAll("a", {"class": "single-product"})
    for product in products:
        try:
            product_title = product.find("div", {"class": "product-title"}).text.strip()
        except:
            product_title = "Not available"
        try:
            product_new_price = product.find("span", {"class": "new-price-text"}).text.strip()
        except:
            product_new_price = "Not available"
        try:
            product_old_price = product.find("span", {"class": "old-price-text"}).text.strip()
        except:
            product_old_price = "Not available"
        try:
            product_discount = product.find("div", {"class": "discount"}).text.strip()
        except:
            product_discount = "Not available"
        try:
            product_link = product['href']
        except:
            product_link = "Not available"
        write_csv_row(product_title, product_new_price, product_discount, product_old_price, product_link)
    break  # this stops the parsing at the 1st page. I think it is a good idea to check data and fix all discrepancies before proceeding

driver.close()
If you want to append new lines to the file, you must use the "a" mode instead of "w".
with open('result.csv', 'a') as f:
    f.write("Product Name, Sale Price, Discount, Old Price \n")
definition of "w" option:
Opens a file for writing only. Overwrites the file if the file exists.
If the file does not exist, creates a new file for writing.
definition of "a" option:
Opens a file for appending. The file pointer is at the end of the file
if the file exists. That is, the file is in the append mode. If the
file does not exist, it creates a new file for writing.
definition of "ab" option:
Opens a file for appending in binary format. The file pointer is at
the end of the file if the file exists. That is, the file is in the
append mode. If the file does not exist, it creates a new file for
writing.
Therefore, to append new lines, you must use a mode that contains "a" (the append option).
The definitions are quoted from this answer.
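A minimal illustration of the difference (the file name here is just an example):

with open('demo.txt', 'w') as f:      # 'w' truncates any existing content
    f.write("first line\n")
with open('demo.txt', 'w') as f:
    f.write("second line\n")          # demo.txt now contains only "second line"

with open('demo.txt', 'a') as f:      # 'a' keeps existing content and appends
    f.write("third line\n")           # demo.txt now contains "second line" then "third line"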
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"

    # Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if (td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    # Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

# Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually; Python provides a built-in csv module for that.
Then, since you are only after the "Actions" data, I'd locate the "Actions" table and keep only the event rows. This can be done with the help of a filtering function that checks that the first cell is not empty:
import csv

from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")

data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
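If you prefer something close to the pipe-delimited layout from your example rather than plain CSV, csv.writer also accepts a delimiter argument; a sketch of the changed lines (output.txt is just a placeholder name):

with open("output.txt", "w") as f:
    writer = csv.writer(f, delimiter='|')   # fields separated by "|" instead of ","
    writer.writerows(data)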
I am a complete programming beginner, so please forgive me if I am not able to express my problem very well. I am trying to write a script that will look through a series of pages of news and will record the article titles and their links. I have managed to get that done for the first page; the problem is getting the content of the subsequent pages. By searching on Stack Overflow, I think I managed to find a solution that makes the script access more than one URL, BUT it seems to be overwriting the content extracted from each page it accesses, so I always end up with the same number of recorded articles in the file. Something that might help: I know that the URLs follow this model: "/ultimas/?page=1", "/ultimas/?page=2", etc., and it appears to be using AJAX to request new articles.
Here is my code:
import csv
import requests
from bs4 import BeautifulSoup as Soup
import urllib

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))

letters = soup.find_all("div", class_="titulo-noticia")
letters[0]

lobbying = {}
for element in letters:
    lobbying[element.a.get_text()] = {}

letters[0].a["href"]

prefix = "http://agenciabrasil.ebc.com.br"
for element in letters:
    lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

import os, csv
os.chdir("...")

with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

import json
with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)

print "Fim"
Any help on how I might go about adding the content of each page to the final file would be very appreciated. Thank you!
How about this one, if it serves the same purpose:
import csv, requests
from lxml import html

base_url = "http://agenciabrasil.ebc.com.br"
program_url = base_url + "/ultimas/?page={0}"

outfile = open('scraped_data.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Caption", "Link"])

for url in [program_url.format(page) for page in range(1, 4)]:
    response = requests.get(url)
    tree = html.fromstring(response.text)
    for title in tree.xpath("//div[@class='noticia']"):
        caption = title.xpath('.//span[@class="field-content"]/a/text()')[0]
        policy = title.xpath('.//span[@class="field-content"]/a/@href')[0]
        writer.writerow([caption, base_url + policy])
It looks like the code in your for loop (for page in range(1, 4):) isn't being called because your file isn't correctly indented.
If you tidy up your code, it works:
import csv, requests, os, json, urllib
from bs4 import BeautifulSoup as Soup

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

lobbying = {}
for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))
    letters = soup.find_all("div", class_="titulo-noticia")
    for element in letters:
        lobbying[element.a.get_text()] = {}
    prefix = "http://agenciabrasil.ebc.com.br"
    for element in letters:
        lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

#os.chdir("...")
with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)

print "Fim"
I've been working on a practice web scraper that gets written reviews and writes them to a CSV file, with each review given its own row. I've been having trouble with it because:
1) I can't seem to strip out the HTML and get only the text (i.e. the written review and nothing else).
2) There are a lot of weird spaces between and even within my review text (i.e. a row of space between lines, etc.).
Thanks for your help!
Code below:
#! python3
import bs4, os, requests, csv

# Get URL of the page
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')

# Looping until the 5th page of reviews
pagecounter = 0
while pagecounter != 5:
    # Request get the first page
    res = requests.get(URL)
    res.raise_for_status

    # Download the html of the first page
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    reviewElems = soup.select('.partial_entry')
    if reviewElems == []:
        print('Could not find clue.')
    else:
        #for i in range(len(reviewElems)):
            #print(reviewElems[i].getText())
        with open('GardensbytheBay.csv', 'a', newline='') as csvfile:
            for row in reviewElems:
                writer = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_ALL)
                writer.writerow(row)
            print('Writing page')

    # Find URL of next page and update URL
    if pagecounter == 0:
        nextLink = soup.select('a[data-offset]')[0]
    elif pagecounter != 0:
        nextLink = soup.select('a[data-offset]')[1]
    URL = 'http://www.tripadvisor.com' + nextLink.get('href')
    pagecounter += 1

print('Download complete')
csvfile.close()
You can use row.get_text(strip=True) to get the text from your selected p.partial_entry. Try the following:
import bs4, os, requests, csv

# Get URL of the page
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')

with open('GardensbytheBay.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ')

    # Looping until the 5th page of reviews
    for pagecounter in range(6):

        # Request get the first page
        res = requests.get(URL)
        res.raise_for_status()

        # Download the html of the first page
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        reviewElems = soup.select('p.partial_entry')
        if reviewElems:
            for row in reviewElems:
                review_text = row.get_text(strip=True).encode('utf8', 'ignore').decode('latin-1')
                writer.writerow([review_text])
            print('Writing page', pagecounter + 1)
        else:
            print('Could not find clue.')

        # Find URL of next page and update URL
        if pagecounter == 0:
            nextLink = soup.select('a[data-offset]')[0]
        elif pagecounter != 0:
            nextLink = soup.select('a[data-offset]')[1]
        URL = 'http://www.tripadvisor.com' + nextLink.get('href')

print('Download complete')
print('Download complete')
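If odd characters still show up in the reviews, one option (an assumption about your environment, not a confirmed fix) is to skip the encode/decode round-trip and open the file with an explicit UTF-8 encoding instead:

# open with an explicit encoding so csv can write Unicode text directly
with open('GardensbytheBay.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ')
    for row in reviewElems:
        writer.writerow([row.get_text(strip=True)])   # no encode()/decode() needed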