Use output of one function as input of another function - python

So I want to check a website and be updated whenever there's a new item posted. They don't update often, so I'm fairly certain that when they do, the update will be the item of interest. I want to achieve this by choosing a "starting number": count the number of links on the page, then compare that count to the number of links every 10 minutes, until the number of links is greater than the starting number.
First I run this to get the "starting number" of links:
links = []
for link in soup.findAll('a'):
    links.append(link.get('href'))
start_num = len(links)
Then I compare that number to the current number of links, checking every 5 minutes:
notify = True
while notify:
    try:
        page = urllib.request.urlopen('web/site/url')
        soup = bs(page, "lxml")
        links = []
        for link in soup.findAll('a'):
            links.append(link.get('href'))
        if len(links) > start_num:
            message = client.messages.create(to="", from_="", body="")
            print('notified')
            notify = False
        else:
            print('keep going')
            time.sleep(60*5)
    except:
        print("Going to sleep")
        time.sleep(60*10)
How can I combine all this into one function, where I can store the starting number of links without overwriting it every time I check it against the current number of links?
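One lightweight way to keep the starting number alive between checks, without a class, is a closure. A minimal sketch with the scraping stubbed out (make_watcher, count_links, and the stub counts are illustrative names, not from the code above):

def make_watcher(count_links):
    # count_links: any zero-argument callable returning the current link count
    start_num = count_links()  # remembered once, never overwritten by later checks

    def check():
        return count_links() > start_num

    return check

# stub usage: pretend the page grew from 3 links to 5
counts = iter([3, 3, 5])
check = make_watcher(lambda: next(counts))
print(check())  # False: still 3 links
print(check())  # True: 5 > 3

In the real script, count_links would wrap the urllib/BeautifulSoup code above, and the caller would time.sleep between checks.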

You can do it in at least two ways: decorators and generators.
Decorators:
def hang_on(func):
    # soup should be in a visible scope
    def count_links():
        # refresh page?
        return len(soup.findAll('a'))
    start_num = count_links()
    def wrapper(*args, **kwargs):
        nonlocal start_num  # needed so wrapper can rebind start_num in the enclosing scope
        while True:
            try:
                new_links = count_links()
                if new_links > start_num:
                    start_num = new_links
                    return func(*args, **kwargs)
                print('keep going')
                time.sleep(60*5)
            except:
                print("Going to sleep")
                time.sleep(60*10)
    return wrapper
@hang_on
def notify():
    message = client.messages.create(to="", from_="", body="")
    print('notified')
# somewhere in your code, simply:
notify()
Generators:
def gen_example(soup):
    # initialize soup (perhaps from url)
    # soup should be in a visible scope
    def count_links():
        # refresh page?
        return len(soup.findAll('a'))
    start_num = count_links()
    while True:
        try:
            new_links = count_links()
            if new_links > start_num:
                start_num = new_links
                message = client.messages.create(to="", from_="", body="")
                print('notified')
                yield True  # this is what makes this func a generator
            print('keep going')
            time.sleep(60*5)
        except:
            print("Going to sleep")
            time.sleep(60*10)
# somewhere in your code:
gen = gen_example(soup) # initialize
next(gen)  # will wait and notify
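If you want more than one notification, you can keep driving the generator in a loop; each iteration blocks until the link count grows again (a sketch, assuming gen_example and soup are set up as above):

# each pass waits for the next increase in the link count
for _ in gen_example(soup):
    print('new links appeared; watching for the next batch')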

I would implement it as a class, because the resulting code is quite readable and easy to maintain. Enjoy:
class Notifier:
    url = 'web/site/url'
    timeout = 60 * 10

    def __links_count(self):
        page = urllib.request.urlopen(self.url)
        soup = bs(page, "lxml")
        links = []
        for link in soup.findAll('a'):
            links.append(link.get('href'))
        return len(links)

    def __notify(self):
        client.messages.create(to="", from_="", body="")
        print('notified')

    def run(self):
        current_count = self.__links_count()
        while True:
            try:
                new_count = self.__links_count()
                if new_count > current_count:
                    self.__notify()
                    break
                time.sleep(self.timeout)
            except:
                print('Keep going')
                time.sleep(self.timeout)

notifier = Notifier()
notifier.run()
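If you want to reuse it for different sites, a small hypothetical variant takes the settings as constructor arguments instead of class attributes (the notification is reduced to a print here; UrlNotifier is an illustrative name):

import time
import urllib.request
from bs4 import BeautifulSoup as bs

class UrlNotifier:
    def __init__(self, url, timeout=60 * 10):
        self.url = url          # page to watch
        self.timeout = timeout  # seconds between checks

    def _links_count(self):
        page = urllib.request.urlopen(self.url)
        soup = bs(page, "lxml")
        return len(soup.find_all('a'))

    def run(self):
        start = self._links_count()
        while self._links_count() <= start:
            time.sleep(self.timeout)
        print('notified')  # put the real client.messages.create(...) call here

# usage: UrlNotifier('web/site/url', timeout=60 * 5).run()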

Related

How to print an incremented number after every loop in Python?

I'm very new to Python, but I've made a lot of progress over the last few days. The script below works fine, but I just can't figure out how to implement code that would print an incremented number every time avail is equal to NO. I'd like it to print something like 'None Available 1' on the first loop, then 'None Available 2' on the second loop, then 'None Available 3' on the third, etc.
import requests
import time
import subprocess
from bs4 import BeautifulSoup

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(soup):
    avail = soup.find('span', id='availability').text.strip()
    if avail == "YES":
        return True
    elif avail == "NO":
        print('None Available')
        return False
    else:
        print("Unexpected value")
        return None

def main():
    url = 'https://www.blahblah.com'
    while True:
        is_available = get_detail_data(get_page(url))
        if is_available:
            subprocess.call(["C:\\temp\\filename.bat"], shell=False)
            break
        time.sleep(2)

if __name__ == '__main__':
    main()
The following would probably work, but there might be a better way to structure it.
_not_avail_counter = 0

def get_detail_data(soup):
    global _not_avail_counter  # required so the += below rebinds the module-level counter
    avail = soup.find('span', id='availability').text.strip()
    if avail == "YES":
        return True
    elif avail == "NO":
        _not_avail_counter += 1
        print('None Available ' + str(_not_avail_counter))
        return False
    else:
        print("Unexpected value")
        return None
I would suggest changing your while True loop into a for loop on an itertools.count iterator. You can pass the value from the count to the get_detail_data function with an argument.
import itertools

def get_detail_data(soup, count):  # take the count as an argument
    avail = soup.find('span', id='availability').text.strip()
    if ...:
        ...
    elif avail == "NO":
        print('None Available', count)  # include count here (and anywhere else you want)
    # ...

def main():
    url = 'https://www.blahblah.com'
    for c in itertools.count():  # produce the count in a loop
        is_available = get_detail_data(get_page(url), c)
        # ...
Note that itertools.count starts counting at zero. If you want to start at 1 (like a human usually would when counting things), pass 1 as the start argument: for c in itertools.count(1).
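As a standalone illustration of count(1), with the scraping stripped out:

import itertools

# count(1) yields 1, 2, 3, ... forever; the break supplies the stopping condition
for c in itertools.count(1):
    print('None Available', c)
    if c >= 3:  # demo only: stop after three iterations
        break

This prints 'None Available 1' through 'None Available 3', which matches the numbering the question asks for.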

Pyautogui For Loop Skips

I'm having an issue looping through a list of items and using the pyautogui.write function to type each iterated item into a text box. Basically, the loop sometimes skips two or three items. Here is my code:
The name_list argument here is an actual list of names
def send_name(name_list):
    time.sleep(4)
    pyautogui.click(1024, 227)
    for name in name_list:
        create_request = pyautogui.click(1195, 239)
        time.sleep(1)
        acct = pyautogui.click(1018, 392)
        pyautogui.click(1053, 427)
        promo_code_box = pyautogui.click(1006, 466)
        pyautogui.write(name)
        time.sleep(2)
        pyautogui.doubleClick(880, 436)
        quantity = pyautogui.click(998, 533)
        submit = pyautogui.click(754, 577)
        time.sleep(1)
        confirm = pyautogui.click(757, 487)
        pyautogui.click(778, 245)
For instance, if name_list were ['prada','gucci','adidas','nike','puma','dior'], the loop will skip around two of the items, whereas I need it to go through the whole list.
Thanks
Adding more sleep statements should ensure no skipping:
def send_name(name_list):
    time.sleep(4)
    pyautogui.click(1024, 227)
    for name in name_list:
        time.sleep(1)
        create_request = pyautogui.click(1195, 239)
        time.sleep(1)
        acct = pyautogui.click(1018, 392)
        time.sleep(1)
        pyautogui.click(1053, 427)
        time.sleep(1)
        promo_code_box = pyautogui.click(1006, 466)
        time.sleep(1)
        pyautogui.write(name)
        time.sleep(2)
        pyautogui.doubleClick(880, 436)
        time.sleep(1)
        quantity = pyautogui.click(998, 533)
        time.sleep(1)
        submit = pyautogui.click(754, 577)
        time.sleep(1)
        confirm = pyautogui.click(757, 487)
        time.sleep(1)
        pyautogui.click(778, 245)
You can use pyautogui.PAUSE = 1 (or more) at the top of the script:

pyautogui.PAUSE = 1  # or more

def send_name(name_list):
    time.sleep(4)
    pyautogui.click(1024, 227)
    for name in name_list:
        create_request = pyautogui.click(1195, 239)
        time.sleep(1)
        acct = pyautogui.click(1018, 392)
        pyautogui.click(1053, 427)
        promo_code_box = pyautogui.click(1006, 466)
        pyautogui.write(name)
        time.sleep(2)
        pyautogui.doubleClick(880, 436)
        quantity = pyautogui.click(998, 533)
        submit = pyautogui.click(754, 577)
        time.sleep(1)
        confirm = pyautogui.click(757, 487)
        pyautogui.click(778, 245)
This gives the interface a second to settle before the next step.
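For context, pyautogui.PAUSE is a module-level setting: every PyAutoGUI call sleeps that many seconds after it runs, so setting it once can replace most of the per-step time.sleep calls. A minimal sketch (the coordinates are placeholders):

import pyautogui

pyautogui.PAUSE = 1.5  # every pyautogui call now pauses 1.5 s after it runs

pyautogui.click(100, 200)   # implicit 1.5 s pause happens here
pyautogui.write('prada')    # and again here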

How to make my web crawler solve a Wikipedia game faster?

I'm making a Wikipedia crawler, but it's very slow. How can I make it faster?
I'm using the requests module and beautifulsoup4 to parse the HTML pages. I've tried implementing multithreading, but it's still slow.
import requests
from bs4 import BeautifulSoup as bs
from queue import Queue

baseURL = "https://en.wikipedia.org"
startURL = "/wiki/French_battleship_Courbet_(1911)"
endURL = "/wiki/Royal_Navy"
tovisit = Queue()
visited = []

def main():
    if (not checkValid(startURL)) or (not checkValid(endURL)):
        print("Invalid URLs entered.")
        quit()
    initCrawler(startURL)

def initCrawler(startURL):
    global tovisit
    global visited
    tovisit.put(startURL)
    finished = False
    while not finished:
        if tovisit.empty():
            finished = True
            continue
        url = tovisit.get()
        childlinks = linkCrawl(url)
        for i in childlinks:
            tovisit.put(i)
        visited.append(url)

def linkCrawl(url):
    global visited
    global tovisit
    global endURL
    print("crawling " + url + "\n")
    r = requests.get(baseURL + url)
    soup = bs(r.content, "html.parser")
    rawlinks = soup.find_all('a', href=True)
    refinedlinks = []
    for rawLink in rawlinks:
        i = rawLink["href"]
        if i is None:
            continue
        # ensure what we have is a string
        if not (type(i) is str):
            continue
        # no point revisiting or re-queueing a link
        if i in visited:
            continue
        if i in list(tovisit.queue):
            continue
        if not checkValid(i):
            continue
        if i == endURL:
            print("yay")
            exit()
        refinedlinks.append(i)
    return refinedlinks

def checkValid(url):
    if not url.startswith("/wiki/"):
        return False
    if url.startswith("/wiki/Special:"):
        return False
    if url.startswith("/wiki/Wikipedia:"):
        return False
    if url.startswith("/wiki/Portal:"):
        return False
    if url.startswith("/wiki/File:"):
        return False
    if url.endswith("(disambiguation)"):
        return False
    return True

if __name__ == "__main__":
    main()
I expect the bot to run faster, but it's actually slow. From what I've read, multithreading alone eventually won't be enough.
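Two low-risk speed-ups are worth sketching before reaching for more threads: the checks `i in visited` and `i in list(tovisit.queue)` scan a list for every single link, so sets make them O(1), and reusing one requests.Session avoids opening a new TCP connection per page. A rough sketch under those assumptions, not a drop-in replacement for the code above:

import requests
from concurrent.futures import ThreadPoolExecutor

baseURL = "https://en.wikipedia.org"
visited = set()               # O(1) membership tests instead of scanning a list
session = requests.Session()  # keeps the TCP connection alive between requests

def fetch(path):
    # note: sharing a Session across threads is common practice but not
    # formally guaranteed thread-safe; use one session per thread if in doubt
    return path, session.get(baseURL + path).content

def fetch_frontier(paths, workers=8):
    # download one BFS level in parallel; parsing can stay sequential
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(fetch, paths))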

Python - how to get updated link that was passed as an argument?

I'm passing a link as an argument to a thread, and I want to scrape the timestamp on it. But inside the function the thread runs, the timestamp value never changes, no matter how many times I rescrape it. How do you make timeLink dynamic so it updates every time it goes around the while loop? Here is the code:
def abcStart(timeLink):
    while True:
        res = timeLink
        res.raise_for_status()
        timestamp = BeautifulSoup(res.content, 'html.parser').find_all('b')
        if timestamp[0].text == otherTimestamp[0].text:
            # work on something
            break
        if timestamp[0].text > otherTimestamp[0].text:
            continue
        else:
            print('not yet')
            time.sleep(30)
            break

timelink = requests.get('http://example.com/somelink')
threadobj = threading.Thread(target=abcStart, args=(timelink))
threadobj.start()
threadobj.join()
It looks like there is only one http request being sent. On this line:
timelink = requests.get('http://example.com/somelink')
the abcStart() function is receiving the http response, and using that one value the whole time it is running. This will cause us to scrape the same page every time. If we want to have a different page to scrape for each loop iteration, we need to perform another http request each time. Something like this:
def abcStart(timeLink):
    while True:
        res = requests.get(timeLink)  # send a fresh request on every pass
        res.raise_for_status()
        timestamp = BeautifulSoup(res.content, 'html.parser').find_all('b')
        if timestamp[0].text == otherTimestamp[0].text:
            # work on something
            break
        if timestamp[0].text > otherTimestamp[0].text:
            continue
        else:
            print('not yet')
            time.sleep(30)
            break

timeLink = 'http://example.com/somelink'  # declare url
threadobj = threading.Thread(target=abcStart, args=(timeLink,))  # args must be a tuple, hence the trailing comma
threadobj.start()
threadobj.join()
I guess you should move timeLink request inside your function:
def abcStart():  # no parameter needed now that the url is fetched inside
    while True:
        res = requests.get('http://example.com/somelink')
        res.raise_for_status()
        timestamp = BeautifulSoup(res.content, 'html.parser').find_all('b')
        if timestamp[0].text == otherTimestamp[0].text:
            # work on something
            break
        if timestamp[0].text > otherTimestamp[0].text:
            continue
        else:
            print('not yet')
            time.sleep(30)
            break

threadobj = threading.Thread(target=abcStart, args=())
threadobj.start()
threadobj.join()

Calling a function within a function within a class

So there's this website that posts something I want to buy at a random time of day for a limited amount of time and I want to write something to send a message to my phone when a new url is posted to that webpage.
I planned on doing this by counting the number of links on the page (since it's rarely updated) and checking that count every 5 minutes against what it was 5 minutes earlier; then 5 minutes later, against what it was 10 minutes earlier; 5 minutes after that, against what it was 15 minutes earlier... and if it's ever greater than what it originally was, sending a message to my phone. Here's what I have so far:
class url_alert:
    url = ''

    def link_count(self):
        notifyy = True
        while notifyy:
            try:
                page = urllib.request.urlopen(self.url)
                soup = bs(page, "lxml")
                links = []
                for link in soup.findAll('a'):
                    links.append(link.get('href'))
                notifyy = False
                print('found', int(len(links)), 'links')
            except:
                print('Stop making so many requests')
                time.sleep(60*5)
        return len(links)

    def phone(self):
        self = phone
        phone.message = client.messages.create(to="", from_="", body="")
        print('notified')

    def looper(self):
        first_count = self.link_count()
        print('outside while')
        noty = True
        while noty:
            try:
                second_count = self.link_count()
                print('before compare')
                if second_count == first_count:
                    self.phone()
                    noty = False
            except:
                print('not quite...')
                time.sleep(60)

alert = url_alert()
alert.looper()
As a test, I set the if statement that determines whether or not to send a message to compare for equality, but the loop kept on running. Am I calling the functions within the looper function the right way?
It looks like you need to eliminate the try block: as it is now, if self.phone() raises an exception you will never leave the loop.
def looper(self):
    first_count = self.link_count()
    while True:
        if first_count != self.link_count():
            self.phone()
            break
        time.sleep(60)
