In a previous question I got the answer from Hedgehog! (How to check for new discounts and send to telegram if changes detected?)
But another question is: how can I get only the new (product) items in the output, and not all the text that has changed? My feeling is that the output I got is literally anything that has changed on the website, not only the newly added discounts.
Here is the code; see the attachment for the output. Thanks again for all the effort.
# Import all necessary packages
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# Define scraper
scraper = cloudscraper.create_scraper()

# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    bot_token = '1XXXXXXXXXXXXXXXXXXXXXXXXXXG5pses8'
    bot_chatID = '-XXXXXXXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message
    response = requests.get(send_text)
    return response.json()

PrevVersion = ""
FirstRun = True

while True:
    # Download the page with the specified URL
    response = scraper.get("https://").content
    # Url for in the messages to show
    url = "https://"
    # Act like a browser
    #headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    # Parse the downloaded page and check for discount on the page
    soup = BeautifulSoup(response, 'html.parser')

    def get_discounts(soup):
        for d in soup.select('.cept-discount'):
            if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
                return True
            else:
                return False

    # Remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()

    discounts = get_discounts(soup)
    soup = soup.get_text()

    # Compare the page text to the previous version and check if there are any discounts in your range
    if PrevVersion != soup and discounts:
        # On the first run - just memorize the page
        if FirstRun == True:
            PrevVersion = soup
            FirstRun = False
            print("Start Monitoring " + url + " " + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            OldPage = PrevVersion.splitlines()
            NewPage = soup.splitlines()
            diff = difflib.context_diff(OldPage, NewPage, n=0)
            out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()])
            print(out_text)
            OldPage = NewPage
            # Send a message with the telegram bot
            telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
            # print('\n'.join(diff))
            PrevVersion = soup
    else:
        print("No Changes " + str(datetime.now()))
    time.sleep(5)
    continue
What happens?
As discussed, your assumptions are going in the right direction: all the changes identified by difflib will be displayed.
It may be possible to adjust what difflib outputs, but difflib is not strictly necessary for this task.
How to fix?
The first step is to upgrade get_discounts(soup) so that it not only checks whether a discount is in range, but also collects information about the item itself, in case you want to display it or work with it later:
def get_discounts(soup):
    discounts = []
    for d in soup.select('.cept-discount'):
        if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
            discounts.append({
                'name': d.find_previous('strong').a.get('title'),
                'url': d.find_previous('strong').a.get('href'),
                'discount': d.text,
                'price': d.parent.parent.select_one('.thread-price').text,
                'bestprice': d.previous_sibling.text
            })
    return discounts
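Each entry in the returned list is then a dict along these lines (the values below are invented for illustration):

example_discount = {
    'name': 'Gigaset Plug Startpakket',
    'url': 'https://nl.pepper.com/aanbiedingen/gigaset-plug-startpakket-221003',
    'discount': '-70%',
    'price': '29,99€',
    'bestprice': '99,99€'
}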
The second step is to check whether there is a new discount, similar to difflib but more focused:
def compare_discounts(d1: list, d2: list):
    diff = [i for i in d1 + d2 if i not in d1]
    result = len(diff) == 0
    if not result:
        return diff
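For instance, with two hypothetical lists of stripped-down discount dicts, only the entries missing from the first list come back:

old = [{'name': 'Item A', 'discount': '-70%'}]
new = [{'name': 'Item A', 'discount': '-70%'}, {'name': 'Item B', 'discount': '-80%'}]

print(compare_discounts(old, new))
# -> [{'name': 'Item B', 'discount': '-80%'}]
# when nothing new is found, the function returns None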
The last step is to react to changes in the discounts; if there are new ones, it will print their URLs so you can go directly to the offered products.
Note: Because we have stored additional information in our list of dicts, you can adjust the printing to output the whole record or only specific attributes.
if newDiscounts:
    # Send a message with the telegram bot
    print('\n'.join([c['url'] for c in newDiscounts]))
    telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
Example
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# Define scraper
scraper = cloudscraper.create_scraper()

# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    bot_token = '1XXXXXXXXXXXXXXXXXXXXXXXXXXG5pses8'
    bot_chatID = '-XXXXXXXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message
    response = requests.get(send_text)
    return response.json()

PrevVersion = ""
PrevDiscounts = []
FirstRun = True

def get_discounts(soup):
    discounts = []
    for d in soup.select('.cept-discount'):
        if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
            discounts.append({
                'name': d.find_previous('strong').a.get('title'),
                'url': d.find_previous('strong').a.get('href'),
                'discount': d.text,
                'price': d.parent.parent.select_one('.thread-price').text,
                'bestprice': d.previous_sibling.text
            })
    return discounts

def compare_discounts(d1: list, d2: list):
    diff = [i for i in d1 + d2 if i not in d1]
    result = len(diff) == 0
    if not result:
        return diff

while True:
    # Download the page with the specified URL
    response = requests.get("https://nl.pepper.com/nieuw").content
    # Url for in the messages to show
    url = "https://nl.pepper.com/nieuw"
    # Parse the downloaded page and check for discount on the page
    soup = BeautifulSoup(response, 'html.parser')
    # Remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    discounts = get_discounts(soup)
    souptext = soup.get_text()
    # Compare the page text to the previous version and check if there are any discounts in your range
    if PrevVersion != souptext and discounts:
        # On the first run - just memorize the page
        if FirstRun == True:
            PrevVersion = souptext
            PrevDiscounts = discounts
            FirstRun = False
            print("Start Monitoring " + url + " " + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            newDiscounts = compare_discounts(PrevDiscounts, discounts)
            if newDiscounts:
                print('\n'.join([c['url'] for c in newDiscounts]))
                # Send a message with the telegram bot
                telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
            else:
                print('These are general changes but there are no new discounts available.')
            PrevVersion = souptext
            PrevDiscounts = discounts
    else:
        print("No Changes " + str(datetime.now()))
    time.sleep(10)
    continue
Output
Start Monitoring https://nl.pepper.com/nieuw 2021-12-12 12:28:38.391028
No Changes 2021-12-12 12:28:54.009881
Changes detected at: 2021-12-12 12:29:04.429961
https://nl.pepper.com/aanbiedingen/gigaset-plug-startpakket-221003
No Changes 2021-12-12 12:29:14.698933
No Changes 2021-12-12 12:29:24.985394
No Changes 2021-12-12 12:29:35.271794
No Changes 2021-12-12 12:29:45.629790
No Changes 2021-12-12 12:29:55.917246
Changes detected at: 2021-12-12 12:30:06.184814
These are general changes but there are no new discounts available.
Related
I'm stuck on a little problem and hope you can help.
I want to create a df by scraping from two parts of a web page. I seem to be stuck on the second part.
My requirement is to get a df with each Horse name and the associated odds.
eg.
Horse Odds
name1 odd1
name2 odd2
I've used a sample page in the script but it will be the same for any:
base url: https://www.racingtv.com/racecards/tomorrow
then select any time to get another page with the horse name and odds details etc.
import requests
import pandas as pd
from bs4 import BeautifulSoup

def main():
    # base url is https://www.racingtv.com/racecards/tomorrow
    # select any time to get the horse name and odds details etc.
    url = 'https://www.racingtv.com/racecards/catterick-bridge/372180-watch-racing-tv-now-novices-hurdle-gbb-race?'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    strike = soup.select('div', class_='data-strike-out-group')
    # this bit seems to be working
    for data in soup.find_all('div', class_='racecard__runner__column racecard__runner__name'):
        for a in data.find_all('a'):
            print(a.text)
    # this bit sort of works but it seems to repeat the first three items of data
    for odds in soup.find_all('div', class_='racecard__runner__column racecard__runner__column--price'):
        for odd1 in odds.find_all('ruk-odd'):
            print(odd1.text)
    # I tried this to work out how to stop getting the three duplicates but it does not work
    for odds in strike.select('div', class_='racecard__runner__column racecard__runner__column--price'):
        for odd1 in odds.find_all('ruk-odd'):
            print(odd1.text)
    return

if __name__ == '__main__':
    main()
class_='data-strike-out-group'
This isn't a class, check the raw HTML. It's an attribute of the div... weird.
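Since data-strike-out-group is an attribute rather than a class, it could be targeted with a CSS attribute selector or via attrs (a small sketch, not tested against the live page):

# every div carrying the attribute, regardless of its value
strike = soup.select('div[data-strike-out-group]')

# equivalent using find_all
strike = soup.find_all('div', attrs={'data-strike-out-group': True})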
Glad you posted this, might end up using this site for a personal project. Figured you'd be interested in this code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}

url = 'https://www.racingtv.com/racecards/catterick-bridge/372180-watch-racing-tv-now-novices-hurdle-gbb-race?'
resp = requests.get(url, headers=headers)
print(resp)
soup = BeautifulSoup(resp.text, 'html.parser')

table = soup.find('div', {'class': 'page__content__section racecard'})
race_id = url.split('/')[-1].split('-')[0]
race_name = soup.find('div', class_='race__name').text.strip()
race_date = soup.find('div', class_='race__date').text.strip()
clean_date = datetime.strptime(race_date, '%d %b %Y').strftime('%Y%m%d')
race_info1 = soup.find_all('div', class_='race__subtitle')[0].text.strip()
race_info2 = soup.find_all('div', class_='race__subtitle')[1].text.strip()

final = []
for row in table.find_all('div', class_='racecard__runner--content'):
    try:
        num = row.find('div', class_='racecard__runner__cloth-number').text.strip()
        last_days_ugly = row.find('div', class_='racecard__runner__name').find('a').find('sup').text
        horse_name = row.find('div', class_='racecard__runner__name').find('a').text.strip().replace(last_days_ugly, '')
        horse_link = 'http://www.racingtv.com' + row.find('div', class_='racecard__runner__name').find('a')['href']
        last_race_days = last_days_ugly.strip().replace('(', '').replace(')', '')
        for people in row.find_all('div', class_='racecard__runner__person'):
            if 'J:' in people.getText():
                jockey = people.find('a').text.strip()
                jockey_link = 'http://www.racingtv.com' + people.find('a')['href']
            if 'T:' in people.getText():
                trainer = people.find('a').text.strip()
                trainer_link = 'http://www.racingtv.com' + people.find('a')['href']
        form = row.find('div', class_='racecard__runner__column--form_lr').find_all('div')[0].text.strip()
        equip = row.find('div', class_='racecard__runner__column--form_lr').find_all('div')[1].text.strip()
        weight = row.find('div', class_='racecard__runner__column--weight_age').find_all('div')[0].text.strip()
        age = row.find('div', class_='racecard__runner__column--weight_age').find_all('div')[1].text.strip()
        o_r = row.find('div', class_='racecard__runner__column--or').text.strip()
        odds = row.find('div', class_='racecard__runner__column--price').getText()
        odds_dec = row.find('div', class_='racecard__runner__column--price').find('ruk-odd')['data-js-odds-decimal']
        odds_data = row.find('div', class_='racecard__runner__column--price').find('ruk-odd')['data-js-odd-alternatives']
    except AttributeError:  # skip blank starting gates
        continue
    item = {
        'race_url': url,
        'race_id': race_id,
        'race_name': race_name,
        'race_date': clean_date,
        'race_info1': race_info1,
        'race_info2': race_info2,
        'num': num,
        'horse_name': horse_name,
        'horse_link': horse_link,
        'last_race_days': last_race_days,
        'jockey': jockey,
        'jockey_link': jockey_link,
        'trainer': trainer,
        'trainer_link': trainer_link,
        'form': form,
        'equip': equip,
        'weight': weight,
        'age': age,
        'o_r': o_r,
        'odds': odds,
        'odds_dec': odds_dec,
        'odds_data': odds_data
    }
    final.append(item)

df = pd.DataFrame(final)
df.to_csv('racingtv.csv', index=False)
print('Saved to racingtv.csv')
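If you only need the two columns from the original question, the resulting df can be trimmed afterwards (a sketch, assuming the column names used above):

horse_odds = df[['horse_name', 'odds']].rename(columns={'horse_name': 'Horse', 'odds': 'Odds'})
print(horse_odds)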
Following on from the script kindly supplied by bushcat69, and my subsequent question about how to get the race time into the df, I have cobbled together some code (cut and paste from other sites). I thought you might be interested. It may not be elegant but it seems to work. The section:
race_data.extend(get_racecards_data(url_race, date, racetime))
is used to pass the URL etc. to the bushcat69 script.
Thanks again.
def get_meetings():
    global date
    global date_ext
    odds_date = date_ext
    url = f'https://www.racingtv.com/racecards/{date_ext}'
    try:
        res = requests.get(url, headers=headers)
    except:
        print('Date or Connection error occured! \nTry again!!')
        return
    soup = BeautifulSoup(res.text, 'html.parser')
    meetings = soup.select('.race-selector__times__race')
    course_num = len(meetings)
    meetings1 = [a['href'] for a in soup.select('.race-selector__times__race')]
    course_num = len(meetings1)
    cnt01 = 0
    if course_num == 0:
        print('Provide a upcoming valid date')
        return
    for track in meetings1[:course_num]:
        cnt01 = cnt01 + 1
        trackref = track.split("/")[2]
        print(cnt01, ": ", trackref)
    need = input(f'{course_num} courses found \nHow many courses to scrape? Press \'a\' for all :\n')
    if need == 'a':
        n = course_num
    else:
        try:
            n = int(need)
        except:
            print('Invalid input !')
            return
    cnt01 = 0
    race_data = []
    for mtm in meetings[:course_num]:
        cnt01 = cnt01 + 1
        racetime = mtm.text
        href = mtm.attrs
        htxt = Text(href)
        url_race = htxt.partition("/")[2]
        url_race = "/" + url_race.rpartition("'")[0]
        print(cnt01, racetime, url_race)
        time.sleep(1)
        race_data.extend(get_racecards_data(url_race, date, racetime))
        print(f"Meeting {url_race.split('/')[2]} scraping completed")
        if cnt01 == n:
            break
    df_race = pd.DataFrame(race_data)
    df = df_race
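As an aside, each mtm returned by soup.select() is a tag whose href can be read directly, which would avoid the Text/partition juggling (a sketch, not tested against the live page):

for mtm in meetings[:course_num]:
    racetime = mtm.text
    url_race = mtm['href']  # relative link, e.g. '/racecards/<course>/<race-id>-...'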
So, I am scraping a webpage that has an element displaying an integer. When I scrape that element, I store the plain text in a variable; then, on each subsequent scrape, I compare the variable to the plain text currently on the webpage. I am not sure if maybe I need to send a request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup

toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

_title = ""
_message = ""
recent_mmr = "111"

def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if t.text.strip() == "Rating":
                val = stat.find(class_="value").text.strip()
                return val

def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()

_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)

import time

while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message=_message,
    app_icon=NULL,
    timeout=10
)

print(recent_mmr)
recent_mmr = get_mmr(URL)

while True:
    print('running')
    #page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp + "(temp mmr)")
    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message=_message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal. Well, of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None

while True:
    recent_mmr = get_mmr()
    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
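On the question of whether the page needs to be requested each time: yes, otherwise get_mmr keeps reading the soup object that was built once at startup. A sketch of get_mmr that downloads a fresh copy on every call, using the same selectors as above:

def get_mmr(url):
    page = requests.get(url)  # fresh request on every poll
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        label = stat.find(class_="label")
        if label and label.text.strip() == "Rating":
            return stat.find(class_="value").text.strip()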
I am trying to get a product list from a website with Selenium. I prototyped the program and everything worked perfectly, but now I built a loop to get all products and it just gives me the same product 484 times (that's the number of products there are on the website).
Here is my code:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import selenium
from selenium import webdriver

# URL to web scrape from
page_url = "https://www.smythstoys.com/at/de-at/spielzeug/lego/c/SM100114"
driver = webdriver.Chrome()
driver.get(page_url)

buttonName = "loadMoreProducts"
loadMoreButton = driver.find_element_by_id(buttonName)
while loadMoreButton is not None:
    try:
        try:
            loadMoreButton.click()
        except selenium.common.exceptions.ElementNotInteractableException:
            break
    except selenium.common.exceptions.ElementClickInterceptedException:
        break

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# gets all products
containers = driver.find_elements_by_tag_name('article')
print(len(containers))

# name the output file to write to local disk
out_filename = "smythstoys_product_data.csv"
# header of csv file to be written
headers = "product_name;price; info \n"

# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

# loops through all products
# -----------------------------------------------------------------------
# here is the problem:
for container in driver.find_elements_by_tag_name('article'):
    print("----------------------------------------------------------------------")
    product_name_container = container.find_element_by_xpath("//h2[@class ='prodName trackProduct']")
    product_name = product_name_container.text
    print(product_name)
    price_container = container.find_element_by_xpath("//div[@class ='price']")
    price = price_container.text
    print("price:", price)
    # ------------------------------------------------------------------------------------
    try:
        info_container = container.find_element_by_xpath("//span[@class ='decalImage-right']").text
        print(info_container)
        if not info_container:
            info = "no special type"
            print(info)
            print(info_container)
            f.write(product_name + "; " + price + "; " + info + "\n")
            continue
        if info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hed/h5f/8823589830686" \
                             "/lego-hard-to-find-decal_CE.svg":
            info = "seltenes Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/h41/h70" \
                               "/8823587930142/new-decal_CE%20%281%29.svg":
            info = "neues Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hde/hae" \
                               "/8871381303326/sale-decal_CE.svg":
            info = "Sale"
        else:
            info = "unknown type" + info_container
            print(info)
            print(info_container)
    except NameError:
        print("no atribute")
        if info_container is None:
            info = "unknown type"
            print(info)
    # writes the dataset to file
    f.write(product_name + "; " + price + "; " + info + "\n")

f.close()  # Close the file
My output is:
LEGO Star Wars 75244 Tantive IV
price: 199,99€
no special type
and that 484x
I'm not sure why you used selenium to get the products when requests can do it smoothly. The following is something you wanna do to get all the products using requests.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

link = "https://www.smythstoys.com/at/de-at/at/de-at/spielzeug/lego/c/SM100114/load-more?"
params = {'q': ':bestsellerRating:productVisible:true', 'page': '1'}

p = 0
while True:
    params['page'] = p
    r = requests.get(link, params=params, headers={
        'content-type': 'application/json; charset=utf-8'
    })
    soup = BeautifulSoup(r.json()['htmlContent'], "lxml")
    if not soup.select_one("a.trackProduct[href]"): break
    for item in soup.select("a.trackProduct[href]"):
        product_name = item.select_one("h2.prodName").get_text(strip=True)
        product_price = item.select_one("[itemprop='price']").get("content")
        print(product_name, product_price)
    p += 1
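If you still want the CSV that the original script wrote, one option (a sketch, assuming pandas is available) is to collect the rows in a list of dicts inside the loop and dump them once at the end:

import pandas as pd

rows = []  # fill this inside the item loop above, e.g.:
# rows.append({'product_name': product_name, 'price': product_price})

df = pd.DataFrame(rows)
df.to_csv('smythstoys_product_data.csv', sep=';', index=False)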
I am trying to extract some information about MTG cards from a webpage with the following program, but I repeatedly retrieve information about the initial page given (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe there is a restriction on using urllib that slipped my attention. Here is the code that I have struggled with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup

InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4  # depth of pages to be retrieved
query = InitUrl.split("?")[1]

for i in range(0, NumOfPages):
    if i == 0:
        Url = InitUrl
    else:
        Url = URL_Next
    print(Url)

    UClient = uReq(Url)  # downloading the url
    page_html = UClient.read()
    UClient.close()

    page_soup = soup(page_html, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})

    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")

    try:
        URL_Next = InitUrl + "&page=" + str(i + 2)
        print("The next URL is: " + URL_Next + "\n")
    except IndexError:
        print("Crawling process completed! No more infomation to retrieve!")
    else:
        NumOfCrawledPages += 1
        Url = URL_Next
    finally:
        print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require them to allow paging.
A clean and simple way of extracting the data you're interested in would be like this:
import requests
from bs4 import BeautifulSoup
# the site actually uses this url under the hood for paging - check out Google Dev Tools
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"
return_list = []
# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
session.get("https://mtgsingles.gr/")
All pages have a next button except the last one, so we use this knowledge to loop until the next button goes away. When it does, meaning the last page has been reached, the button is replaced with an 'li' tag with the class 'next hidden', which only exists on the last page.
Now we're ready to start looping
page = 1  # set count for start page
keep_paging = True  # use flag to end loop when last page is reached
while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')
    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        try:
            toughness = toughness_element.get_text().strip()
        except:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)
    if soup.select('li.next.hidden'):  # this element only exists if the last page is reached
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1

# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will scroll until no more pages exist, no matter how many subpages the site has.
My point in the comment above was merely that if you encounter an exception in your code, your page count would never increase. That's probably not what you want, which is why I recommended that you learn a little more about the behaviour of the whole try-except-else-finally deal.
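A minimal, hypothetical illustration of that behaviour: when the try block raises, the else block is skipped (so a counter incremented there never grows), while finally runs regardless:

pages = []                      # empty on purpose, to force the exception
pages_crawled = 0
try:
    next_url = pages[0]         # raises IndexError
except IndexError:
    print("no more pages")      # runs because of the exception
else:
    pages_crawled += 1          # skipped whenever except ran, so the count never grows
finally:
    print("moving on")          # runs in every case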
I am also baffled by the request giving the same reply and ignoring the page parameter. As a dirty solution I can offer you to first set the page-size to a number high enough to get all the items that you want (this parameter works, for some reason...).
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup

InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2  # depth of pages to be retrieved
query = "dragon"
cardSet = set()

for i in range(1, NumOfPages):
    page_html = requests.get(InitUrl, params={"page": i, "q": query, "page-size": 999})
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})

    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)

    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " + str(len(cards)) + "(cards)\n")
I am working on a script to scrape a website. The problem is that it works normally when I run it with the interpreter, but after compiling it (PyInstaller or py2exe) it fails; it appears that mechanize / requests both fail to keep the session alive.
I have hidden my username and password here, but I did put them correctly in the compiled code
import requests
from bs4 import BeautifulSoup as bs
from sys import argv
import re
import logging

url = argv[1]
payload = {"userName": "real_username", "password": "realpassword"}

session = requests.session()
resp = session.post("http://website.net/login.do", data=payload)
if "forgot" in resp.content:
    logging.error("Login failed")
    exit()

resp = session.get(url)
soup = bs(resp.content)
urlM = url[:url.find("?") + 1] + "page=(PLACEHOLDER)&" + \
       url[url.find("?") + 1:]

# Get number of pages
regex = re.compile("\|.*\|\sof\s(\d+)")
script = str(soup.findAll("script")[1])
epNum = int(re.findall(regex, script)[0])  # Number of EPs
pagesNum = epNum // 50

links = []
# Get list of links
# If number of EPs > 50, more than one page
if pagesNum == 0:
    links = [url]
else:
    for i in range(1, pagesNum + 2):
        url = urlM.replace("(PLACEHOLDER)", str(i))
        links.append(url)

# Loop over the links and extract info: ID, NAME, START_DATE, END_DATE
raw_info = []
for pos, link in enumerate(links):
    print "Processing page %d" % (pos + 1)
    sp = bs(session.get(link).content)
    table = sp.table.table
    raw_info.extend(table.findAll("td"))

epURL = "http://www.website.net/exchange/viewep.do?operation"\
        "=executeAction&epId="

# Final data extraction
raw_info = map(str, raw_info)
ids = [re.findall("\d+", i)[0] for i in raw_info[::4]]
names = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[1::4]]
start_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[2::4]]
end_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[3::4]]
emails = []
eplinks = [epURL + str(i) for i in ids]
print names
The error happens at the epNum variable, which means, as I figured, that the HTML page is not the one I requested. It works normally on Linux both as a script and compiled, and on Windows as a script, but it fails when compiled.
The py2exe tutorial mentions that you need MSVCR90.dll; did you check that it's present on the PC?
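Separate from the DLL question, it may help to log what the login actually returns in the compiled build; a small debugging sketch (hypothetical, using only requests' own session and cookie jar):

import logging
import requests

logging.basicConfig(level=logging.DEBUG)  # urllib3 then logs every request it sends

session = requests.session()
resp = session.post("http://website.net/login.do",
                    data={"userName": "real_username", "password": "realpassword"})
logging.debug("login status: %s", resp.status_code)
logging.debug("cookies after login: %s", session.cookies.get_dict())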