I'm having a problem with a nested for loop (for doc in query) that only runs once. It's inside for item in news_items, which I have verified iterates 10 times, and the for doc in query loop should iterate 9 times. When I print doc, it prints 9 documents, but when I add an if/else check on the document's content, it only happens once. (I would expect 9 x 10 outputs, since every item from the parent loop is checked against every doc in query, but all I get is 9 outputs.)
I've tried searching Stack Overflow, but nothing I found seems relevant. Coming from the other programming languages I work with, I don't see why this wouldn't work, but maybe I'm missing something since I'm fairly new to Python (one week).
# imports assumed by this snippet; "driver" (a Selenium WebDriver) and "db" (a Firestore
# client) are created elsewhere in the script
from datetime import date
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from google.cloud import firestore


def scrape(url):
    # get the date at the time of crawl start
    today = date.today()
    d1 = today.strftime("%d/%m/%Y")
    # d2 is used for the query only
    d2 = today.strftime("%Y%m%d")
    # load the url in the driver
    driver.get(url)
    try:
        news_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "FlashNews-Box-Root"))
        )
        # array of items
        news_items = news_container.find_elements_by_class_name("FlashNews-Box-Item")
        refresher_ref = db.collection(u'news').document('sources').collection('refresher_news')
        # query for the last articles
        query = refresher_ref.order_by(u'article_timestamp', direction=firestore.Query.DESCENDING).limit(10).stream()
        for item in news_items:
            print("News items found: " + str(len(news_items)))
            try:
                # the image is optional, so we need to try it
                try:
                    item_image = item.find_element_by_class_name("FlashNews-Box-ItemImage").find_element_by_tag_name(
                        "img").get_attribute("src")
                except Exception as e:
                    item_image = "unavailable"
                # the time is added to the same day this was run; since this runs often and compares
                # article texts, we won't have an issue with wrong dates
                item_time = item.find_element_by_class_name("FlashNews-Box-ItemTime").text + " " + d1
                item_time_query_temp = item.find_element_by_class_name("FlashNews-Box-ItemTime").text.replace(":", "")
                # normalize the timestamp for sorting
                if len(item_time_query_temp) == 3:
                    item_time_query_temp = "0" + item_time_query_temp
                item_time_query = d2 + item_time_query_temp
                item_text = item.find_element_by_class_name("FlashNews-Box-ItemText").text
                item_redirect = item.find_element_by_class_name("FlashNews-Box-ItemText").find_element_by_tag_name(
                    "a").get_attribute("href")
                result = {"article_time": item_time, "article_url": item_redirect, "article_image": item_image,
                          "article_text": item_text, "article_timestamp": item_time_query}
                # print(result)
                # save data to firestore - check for the last item in firestore, then add this article
                is_new = True
                print("Printing 10x")
                # THIS EXECUTES ONLY ONCE?
                for doc in query:
                    # print(str(len(query)))
                    current_doc = doc.to_dict()
                    # print(current_doc)
                    # print("Iteration: " + current_doc['article_text'])
                    # print("Old: " + current_doc["article_text"] + " New: " + item_text)
                    if current_doc['article_text'] == item_text:
                        print("Match")
                        # print(current_doc['article_text'] + item_text)
                        # print("Old: " + current_doc['article_text'] + " New: " + item_text)
                    else:
                        print("Mismatch")
                        # print(current_doc['article_text'] + item_text)
                        # print("Skipping article as the text exists in the last 10")
                # print(str(is_new))
                # if is_new:
                #     refresher_ref.add(result)
                #     print("Adding document")
            except Exception as e:
                print(e)
    except Exception as e:
        # handle errors
        print(e)
    print("Completed running.")
    # quit the driver at the end of the function run
    driver.quit()
query isn't a list, but some other iterable type that you can only consume once (similar to a generator). In order to use it multiple times in the outer loop, you'll need to create a list to hold the contents in memory. For example:
# query for the last articles
query = refresher_ref.order_by(u'article_timestamp', direction=firestore.Query.DESCENDING).limit(10).stream()
query = list(query)
for item in news_items:
    ...
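To make the one-shot behaviour concrete, here is a minimal, self-contained sketch (plain Python, no Firestore) showing that a generator-like iterable is exhausted after the first pass of the outer loop, while list() materializes it so every pass sees all the documents:
def fake_stream():
    # stands in for the Firestore stream(): it yields its items only once
    for i in range(3):
        yield "doc%d" % i

query = fake_stream()
for item in ["a", "b"]:
    for doc in query:              # runs only while item == "a"; after that the iterator is empty
        print(item, doc)

query = list(fake_stream())        # materialize the results once
for item in ["a", "b"]:
    for doc in query:              # now every outer iteration sees all three docs
        print(item, doc)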
Related
I am attempting to scrape info from a few hundred thousand files I am reading from disk and shove it into a SQLite database. I would like to be able to stop reading the files (e.g. on a crash or user interrupt) and have the script pick up where it left off, meaning it doesn't start over with file number one every time. I've started working from a code sample that is helpful, but it assumes you have sequentially numbered files delivered in sequential order. My issues:
I understand glob returns files in arbitrary order
Files in my directory are similar in name, not sequential
File names in the directory might look like:
249959 Run Data Email D.eml
250000 Returned mail s.eml
250002 Warning could n.eml
Here is where I am code-wise:
# (assumed by this snippet) conn / cur are an open sqlite3 connection and cursor
import glob
import os
import sqlite3

cur.executescript('''
CREATE TABLE IF NOT EXISTS MailSubject (id INTEGER UNIQUE, subject TEXT)''')

# Pick up where we left off
start = None
cur.execute('SELECT max(id) FROM MailSubject')
try:
    row = cur.fetchone()
    if row is None:
        start = 0
    else:
        start = row[0]
except:
    start = 0
if start is None: start = 0

# Number the user wants to grab
many = 0
# Number processed this go
count = 0
fail = 0

while True:
    if many < 1:
        conn.commit()
        sval = input('How many messages:')
        if len(sval) < 1: break
        many = int(sval)

    start = start + 1
    cur.execute('SELECT id FROM MailSubject WHERE id=?', (start,))
    try:
        row = cur.fetchone()
        if row is not None: continue
    except:
        row = None

    many = many - 1
    print("Many:", many)

    # This is where you would define the URL or file name to open;
    # the line below works for a URL with sequential file names
    # url = baseurl + str(start) + '/' + str(start + 1)

    text = "None"
    try:
        os.chdir("INBOX-200")
        # This is how I open files now, no start / stop
        # Files are not sequentially numbered
        # glob retrieves items in arbitrary order
        for file in glob.glob("*.eml"):
            try:
                with open(file, 'r') as f:
                    text = f.read()
            except KeyboardInterrupt:
                print('')
                print('Program interrupted by user...')
                break
            except:
                print('XXX File cannot be opened:', file)
                fail = fail + 1
                if fail > 5: break
            break
    except Exception as e:
        # (no handler for this outer try was shown in the snippet as posted; added so it parses)
        print('XXX Cannot read directory:', e)
        break

    # Must have succeeded, increase the qty processed this round
    count = count + 1
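Not an answer to the parsing itself, but since glob returns paths in arbitrary order, one way to make the run resumable is to sort the file names and use the file name itself as the unique key, skipping anything already recorded. A minimal sketch of that idea (hypothetical table and column names, separate from the code above):
import glob
import os
import sqlite3

conn = sqlite3.connect('mail.sqlite')      # hypothetical database file
cur = conn.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS ProcessedFile (fname TEXT UNIQUE, subject TEXT)')

for fname in sorted(glob.glob(os.path.join('INBOX-200', '*.eml'))):
    cur.execute('SELECT 1 FROM ProcessedFile WHERE fname = ?', (fname,))
    if cur.fetchone() is not None:
        continue                           # already processed: this is the "pick up where we left off"
    with open(fname, 'r', errors='ignore') as f:
        text = f.read()
    subject = None                         # placeholder: extract the subject from text here
    cur.execute('INSERT OR IGNORE INTO ProcessedFile (fname, subject) VALUES (?, ?)',
                (fname, subject))
    conn.commit()                          # commit often so an interruption loses little work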
Thanks for your assistance.
I am trying to implement multiprocessing in my web crawler. What I usually see online is sending the URL as args into the function for map, map_async, or apply_async. The data I am crawling is in a table, so I extract it by doing BeautifulSoup find_all twice, once for rows and once for columns. Since the data I am crawling is sometimes on a single page, which only requires one URL, I tried to use the list returned by find_all as the args for map_async, but I get the error "Fatal Python error: Cannot recover from stack overflow."
The error occurs on the following line:
return_list = pool.map_async(func, Species_all_recorded_data_List)
How could I solve it, or where would multiprocessing be better placed?
The second problem is that if I put some code above the function crawl_all_data_mp, all of that code executes again when pool = Pool() runs. I worked around it by simply moving all the other code under that function, which might not be correct, since I still can't really run the code due to the first error.
Looking for your advice.
My code:
(1) Function to call for web crawling
from tkinter import filedialog
from tkinter import *
import csv
import os.path
from os import path
from Index import *
from Dragonfly import *
import codecs
from multiprocessing import Process, Value
# (also needed by this snippet, unless provided by the star imports above)
import multiprocessing                 # for multiprocessing.Pool
from functools import partial          # for partial(...)
from bs4 import BeautifulSoup          # for BeautifulSoup(...)


# multiprocessing version
def multiprocessing_row_data(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page, Species_all_record_data_Data_Set):
    global DataCNT, stop_crawl_all_data_mp
    tmp_List = Species_all_record_data_Data_Set.find_all('td')
    # End conditions:
    # 1. no data on the next page
    # 2. when updating, stop once the old data is found by inspecting its ID
    # 3. the count goes over the limit count
    id = tmp_List[0].text
    if (len(id) == 0) or (DataCNT >= expecting_CNT) or (DataCNT >= Limit_CNT):
        print(' --Finish crawl--' + ' crawl to page: ' + str(page) + ", ID: " + id + ", count: " + str(DataCNT))
        stop_crawl_all_data_mp = True
        raise StopIteration
    # access the same value in memory when doing multiprocessing
    with DataCNT.getlock():
        DataCNT.value += 1
    response_DetailedInfo = session.post(general_url + Detailed_discriptions_url + id, headers=headers)
    soup2 = BeautifulSoup(response_DetailedInfo.text, 'html.parser')
    print("Current finished datas >> " + str(DataCNT.value) + " /" + str(Total_num) + " (" + str(DataCNT.value * 100 / Total_num) + "%)", end='\r')
    return DetailedTableInfo(tmp_List[0].text, tmp_List[1].text, tmp_List[2].text, tmp_List[3].text, tmp_List[4].text, tmp_List[5].text, tmp_List[7].text, tmp_List[6].text,
                             soup2.find(id='R_LAT').get('value'),
                             soup2.find(id='R_LNG').get('value'),
                             Web_rawl_Species_family_name,
                             Web_rawl_Species_name,
                             soup2.find(id='R_MEMO').get('value'))


def crawl_all_data_mp(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID):
    page = 0
    DataList = []
    while not stop_crawl_all_data_mp:
        pool = multiprocessing.Pool(10)
        Species_all_recorded_data = session.post(general_url +
                                                 species_all_record_data_first_url +
                                                 species_all_record_data_page_url + str(page) +
                                                 species_all_record_data_species_url +
                                                 Species_class_key[Web_rawl_Species_family_name] +
                                                 Species_key[Web_rawl_Species_name],
                                                 headers=headers)
        soup = BeautifulSoup(Species_all_recorded_data.text, 'html.parser')
        Species_all_recorded_data_List = soup.find_all(id='theRow')
        func = partial(multiprocessing_row_data, Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page)
        return_list = pool.map_async(func, Species_all_recorded_data_List)
        DataList.append(list(filter(None, return_list.get())))
        page += 1
    # make sure that when main has finished, the subprocesses still keep rolling on
    pool.close()
    pool.join()
    return [DataList, page]
(2) main
It goes wrong on the following line, which calls the function above:
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
The main code:
# --main--
if __name__ == '__main__':
    # settings
    Input_species_famliy = "細蟌科"
    Input_species = "四斑細蟌"
    limit_cnt = 6000
    folder = 'Crawl_Data\\' + Species_class_key[Input_species_famliy]
    File_name = folder + "\\" + Species_class_key[Input_species_famliy] + Species_key[Input_species] + '.csv'
    oldID = 0
    oldData_len = 0
    print("--Start crawl-- " + Input_species_famliy + " " + Input_species)
    print("[folder]: " + folder)
    stop_crawl_all_data_mp = False

    # check whether the file exists or not
    file_check = path.exists(current_path + "\\" + File_name)

    # get the old ID
    if file_check:
        file_size = os.stat(current_path + "\\" + File_name).st_size
        if not file_size == 0:
            with open(File_name, newline='', errors="ignore") as F:
                R = csv.reader(F)
                oldData = [line for line in R]
                oldID = oldData[0][0]
                oldData_len = len(oldData) - 1

    # login
    Login_Web(myaccount, mypassword)

    # find the total number of records for the input species (expected to run once)
    Species_total_num_Dict = Find_species_total_data()

    # get the data
    Total_num = int(Species_total_num_Dict[Input_species])
    # [datatmpList, page] = crawl_all_data(Input_species_famliy, Input_species, Total_num, limit_cnt, oldID)
    expecting_CNT = Total_num - oldData_len  # the total number of records that need to be updated or crawled
    [datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)

    Data = []
    for Data_tmp in datatmpList:
        Data.append([Data_tmp.SpeciesFamily,
                     Data_tmp.Species,
                     Data_tmp.IdNumber,
                     Data_tmp.Dates,
                     Data_tmp.Times,
                     Data_tmp.User,
                     Data_tmp.City,
                     Data_tmp.Dictrict,
                     Data_tmp.Place,
                     Data_tmp.Altitude,
                     Data_tmp.Latitude,
                     Data_tmp.Longitude,
                     Data_tmp.Description
                     ])

    # automatically make the directories
    newDir = current_path + "\\" + folder
    if not os.path.isdir(newDir):
        os.mkdir(newDir)

    # 'a' stands for append, which appends the new data to the old file
    with open(File_name, mode='a', newline='', errors="ignore") as employee_file:
        employee_writer = csv.writer(employee_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # init, for when no file exists yet or the file is empty
        if (not file_check) or (file_size == 0):
            employee_writer.writerow(CSV_Head)
            employee_writer.writerows(Data)
        # otherwise insert the new data in front of the old data
        else:
            for i in range(0, len(Data)):
                oldData.insert(i, Data[i])
            employee_writer.writerows(oldData)
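Regarding the second problem described above: with the spawn start method (the default on Windows), creating a Pool re-imports the main module, so any top-level code runs again in every worker process. The usual pattern, shown here as a minimal standalone sketch rather than a fix for the crawler itself, is to keep Pool creation under the __main__ guard and to pass the workers plain, picklable data (for example the cell text) instead of BeautifulSoup objects:
import multiprocessing
from functools import partial

def process_row(prefix, row_cells):
    # row_cells is a plain list of strings, which pickles cheaply
    return prefix + ":" + ",".join(row_cells)

if __name__ == '__main__':
    # hypothetical rows; in the crawler these could be [td.text for td in row.find_all('td')]
    rows = [["1", "a"], ["2", "b"], ["3", "c"]]
    func = partial(process_row, "row")
    with multiprocessing.Pool(4) as pool:
        results = pool.map(func, rows)
    print(results)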
I am trying to build an item from many parsing functions, because I am getting data from multiple URLs.
I iterate a dictionary that I built using two for loops, which is why I use two for loops to get the variable needed to generate the URL.
Then, for every variable, I call the second parse function, passing it the needed URL.
This is where I want to call the second parse function from my main parse:
for r in [1,2]:
    for t in [1,2]:
        dataName = 'lane'+str(r)+"Player"+str(t)+"Name"
        dataHolder = 'lane'+str(r)+"Player"+str(t)
        nameP = item[dataName]
        print('before parse ==> lane = ' + str(r) + " team = " + str(t))
        urlP = 'https://www.leagueofgraphs.com/summoner/euw/'+nameP+'#championsData-soloqueue'
        yield Request(urlP, callback=self.parsePlayer, meta={'item': item, "player": dataHolder})
I am using those print() calls to see in the output how my code is executing.
I do the same in my second parse function, which is the following:
def parsePlayer(self, response):
    item = response.meta['item']
    player = response.meta['player']
    print('after parse ====> ' + player)
    mmr = response.css('.rank .topRankPercentage::text').extract_first().strip().lower()
    mmrP = player + "Mmr"
    item[mmrP] = mmr
    # yield the item after the last iteration
(I know I did not explain every detail of the code, but I don't think that's needed to see my problem, not after you see what I'm getting from those prints.)
The result I get:
The expected result:
Also, for some reason, every time I run the spider I get a different random order of prints. This is confusing; I think it's something about the yield. I hope someone can help me with that.
Scrapy works asynchronously (as explained clearly in their official documentation), which is why the order of your prints seems random.
Besides the order, the expected output looks exactly the same as the result you get.
If you can explain why the order is relevant, we might be able to answer your question better.
If you want to yield 1 item with data of all 4 players in there, the following structure can be used:
def start_requests(self):
    # prepare the urls & players:
    urls_dataHolders = []
    for r in [1, 2]:
        for t in [1, 2]:
            dataName = 'lane' + str(r) + "Player" + str(t) + "Name"
            dataHolder = 'lane' + str(r) + "Player" + str(t)
            urlP = 'https://www.leagueofgraphs.com/summoner/euw/' + dataName\
                   + '#championsData-soloqueue'
            urls_dataHolders.append((urlP, dataHolder))
    # get the first url & dataholder
    url, dataHolder = urls_dataHolders.pop()
    yield Request(url,
                  callback=self.parsePlayer,
                  meta={'urls_dataHolders': urls_dataHolders,
                        'player': dataHolder})

def parsePlayer(self, response):
    item = response.meta.get('item', {})
    urls_dataHolders = response.meta['urls_dataHolders']
    player = response.meta['player']
    mmr = response.css(
        '.rank .topRankPercentage::text').extract_first().strip().lower()
    mmrP = player + "Mmr"
    item[mmrP] = mmr
    try:
        url, dataHolder = urls_dataHolders.pop()
    except IndexError:
        # the list of urls is empty, so we yield the item
        yield item
    else:
        # there are still urls to go through
        yield Request(url,
                      callback=self.parsePlayer,
                      meta={'urls_dataHolders': urls_dataHolders,
                            'item': item,
                            'player': dataHolder})
I have read multiple sources for solutions, such as Gaurav's post (including all recommendations in the comments) and Alexander's, to name a few, but I am still not able to fix the issue. The error comes when creating a database entry at models.OrderItemTax.objects.create().
Using Django 1.11.3 and MySQL 14.14.
views.py (only partial code, since the function is over 600 lines long):
def checkout_cart(request):
    try:
        item_to_group_keys = []
        promo_key = 'promo:' + str(request.user.id)
        with connection.cursor() as cursor:
            with transaction.atomic():
                # more code here...
                try:
                    with transaction.atomic():  # updated solution
                        event = ''
                        group = models.Group.objects.get(id=order_item.group.id)
                        now = timezone.now()
                        event_date = now.strftime('%Y-%m-%d')
                        try:
                            event = models.Event.objects.get(group_id=group.id, date=event_date)
                            try:
                                self_attendee = models.EventAttendee.objects.get(event=event,
                                                                                 user_id=request.user.id,
                                                                                 is_attending=True)
                            except:
                                self_attendee = models.EventAttendee.objects.create(event=event,
                                                                                    user_id=request.user.id,
                                                                                    is_attending=True)
                                self_attendee.save()
                        except models.Event.DoesNotExist:
                            event = models.Event.objects.create(group_id=group.id, date=event_date, planner_id=request.user.id)
                            todays_site_deal = models.SiteDeal.objects.get_deal()
                            if todays_site_deal is not None and todays_site_deal != '':
                                event.site_deal = todays_site_deal
                                event.save()
                            self_attendee = models.EventAttendee.objects.create(event=event, user_id=request.user.id, is_attending=True)
                            self_attendee.save()
                        initial_group_members = models.GroupMembership.objects.filter(group_id=group.id, status_id=1)
                        group_members = initial_group_members.exclude(user_id=request.user.id)
                        for gm in group_members:
                            userprofile = models.UserProfile.objects.get(user_id=gm.user_id)
                            attendee_request = models.EventAttendeeRequest.objects.create(event=event, user_id=gm.user_id)
                            attendee_request.status = models.EventRequestStatusOpt.objects.get(id=3)
                            attendee_request.save()
                            group_fn = gm.user.first_name
                            group_ln = gm.user.last_name
                            group_pn = userprofile.phone
                            if group_fn != '' and group_ln != '' and group_pn != '' and group_pn is not None:
                                non_digits = re.compile(r'[^\d]+')
                                group_pn = non_digits.sub('', group_pn)
                                # request_hash = str(member_request.pk) + str(group_id) + str(int(round(time.time())))
                                # request_hash = int(request_hash) ^ 0xABCEEFAB
                                # member_request.hash = request_hash
                                text_message = event.planner.first_name + " from your Cliiique " + event.group.name + " started shopping. Join today's event!"
                                try:
                                    pn_check = client_lookup.phone_numbers.get(group_pn)
                                    if pn_check.phone_number is not None:  # if invalid, throws an exception
                                        # SMS
                                        # -------------------------------------------------
                                        message = client_rest.messages.create(
                                            body=text_message,
                                            to=group_pn,
                                            from_="+12132050074")
                                        # -------------------------------------------------
                                except TwilioRestException as e:
                                    pass
                except:  # group not in checkout, do not create the event
                    pass
                # Get the tax amount for each item
                rate = Decimal('0.0950')
                oi_price = Decimal(order_item.op_price.strip(''))
                tax_amount = oi_price * rate
                oi_tax = models.OrderItemTax.objects.create(amount=tax_amount,
                                                            rate=rate,
                                                            order_item_id=order_item.pk)
                oi_tax.save()
    except IntegrityError:
        return HttpResponse("<strong>CODE #700: Fatal Transaction Error! Please contact customer service.</strong>")
From the docs:
Avoid catching exceptions inside atomic!
Since the function has multiple nested try/excepts, I inserted another transaction.atomic() inside the try/except closest to where the error was occurring.
Per the docs, having nested try/excepts can complicate the initial atomic block. I found that adding a second atomic block helps. Hope this helps others facing a similar situation.
try:
    item_to_group_keys = []
    promo_key = 'promo:' + str(request.user.id)
    with connection.cursor() as cursor:
        with transaction.atomic():
            # more code here...
            try:
                with transaction.atomic():  # added second atomic
                    ...  # more code
            except:
                pass
except IntegrityError:
    return HttpResponse("<strong>CODE #700: Fatal Transaction Error! Please contact customer service.</strong>")
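For reference, the pattern the Django docs recommend for handling database errors naturally is to wrap only the code that might fail in its own atomic block (which acts as a savepoint) and catch the exception outside of it. Roughly, as a sketch with placeholder function names rather than the view above:
from django.db import IntegrityError, transaction

@transaction.atomic
def viewfunc(request):
    do_stuff()                      # placeholder: work inside the outer transaction
    try:
        with transaction.atomic():  # savepoint around the risky writes
            risky_db_writes()       # placeholder for the code that may raise IntegrityError
    except IntegrityError:
        handle_exception()          # the outer transaction is still usable here
    more_stuff()                    # placeholder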
I am trying to find a set of elements, click on each element (which takes me to a new page), and perform some steps on that new page. Then I click the browser's back button (or a button on the new page that takes me back to the previous page), find the same elements again, and repeat the process for the rest of the elements.
I am using the code below to find the elements again before proceeding, but it isn't working. Can someone please help?
elements = driver.find_elements_by_css_selector("#top-tables-chart-container > div > svg > g > g > rect")
counter = 0
for counter in range(counter, len(elements)):
    elements = driver.find_elements_by_css_selector("#top-tables-chart-container > div > svg > g > g > rect")
    webdriver.ActionChains(driver).move_to_element(elements[counter]).click().perform()
    time.sleep(5)
    tableNameLink = elements[counter].find_element_by_xpath("//div[@class='d3-tip bar-chart top-tables-tooltip n']//div[@class='left-section']//div[@class='table-name']//a[contains(@href,'#/table/')]")
    print tableNameLink
    tableNameLink.click()
    tableName = driver.find_element_by_xpath("//div[@class='discover-design-transform-container clearfix']//div[@class='left-header-section clearfix']//div[@class='entity-info table-type']//span[@class='entity-identifier']")
    table = tableName.text
    print " Table: " + table
    print '\n'
    if table == "lineitem":
        TableAccessFreqChartInfoBadgesValidation(self.driver).test_table_access_freq_chart_info_badges_validation("F", "8", "13", "13")
        time.sleep(1)
        print '\n'
    if table == "orders":
        TableAccessFreqChartInfoBadgesValidation(self.driver).test_table_access_freq_chart_info_badges_validation("D", "4", "9", "9")
        time.sleep(1)
        print '\n'
    topUsagePatternsTab = driver.find_element_by_xpath("//div[@id='workload-level-tabs']//a[@href='#/topUsagePatterns']")
    topUsagePatternsTab.click()
You will need to rebuild the list each time you return to the page. You were rebuilding it at the end of your loop, but your for loop still referenced the original list, which is no longer valid. A simple way is to use a counter within the loop to track your position.
elements = driver.find_elements_by_xpath("//your_path")
counter = 0
for counter in range(counter, len(elements)):
    elements = driver.find_elements_by_xpath("//your_path")
    elements[counter].click()
    time.sleep(2)
    discoverPageTables = driver.find_element_by_xpath("//your_path").text
    print "Tables Found :" + discoverPageTables
    discoverPageInstanceCount = driver.find_element_by_xpath("your_path").text
    print "Instance Count Found :" + discoverPageInstanceCount
    discoverpageWorkload = driver.find_element_by_xpath("//your_path").text
    print "Workload Percentage :" + discoverpageWorkload
    discoverPageHiveCompatible = driver.find_element_by_xpath("//your_path").text
    print "Hive Compatible :" + discoverPageHiveCompatible
    discoverPageComplexity = driver.find_element_by_xpath("your_path").text
    print "Complexity :" + discoverPageComplexity
    discoverPageNormalizedComplexity = driver.find_element_by_xpath("your_path").text
    print "Normalized Complexity :" + discoverPageNormalizedComplexity
    print '\n'
    driver.back()
    time.sleep(5)
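As a side note, the fixed sleeps can usually be replaced with explicit waits, so the list is only rebuilt once the elements are present again after driver.back(). A minimal sketch, reusing the placeholder locator from the answer above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def rebuilt_elements(driver, timeout=10):
    # wait until the elements are present again instead of sleeping a fixed time
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, "//your_path")))

elements = rebuilt_elements(driver)
for counter in range(len(elements)):
    elements = rebuilt_elements(driver)    # re-fetch to avoid stale element references
    elements[counter].click()
    # ... work on the detail page ...
    driver.back()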