I've tried to crawl a page with many "article" elements, but the loop returns only the first element every time (as many times as there are article elements).
for offer in response.xpath("//article"):
    product = Product()
    product['kind'] = 'deal'
    product['portal'] = 'mydealz'
    product['link'] = offer.xpath("//a[@class='cept-dealBtn boxAlign-jc--all-c space--h-3 width--all-12 btn btn--mode-primary']/@href").get()
    product['merchant'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title']/a[@class='thread-title--list']/text()").get()
    #product['merchant'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/div[@class='overflow--fade']/a[@class='cept-merchant-link text--color-greyShade size--all-s']/span[@class='cept-merchant-link-term overflow--wrap-off']/span[@class='text--b text--color-brandPrimary cept-merchant-name']/text()").get()
    product['offer'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    #product['offer'] = offer.xpath("//div[@class='threadGrid']/div[@class='threadGrid-title js-contextual-message-placeholder']/strong[@class='thread-title']/a[@class='cept-tt thread-link linkPlain thread-title--list']/text()").get()
    product['crawlingdate'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    yield product
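A likely cause, not stated in the original post: XPath expressions that start with // are evaluated against the entire document even when called on an individual selector, so every iteration returns the first match on the page. Prefixing them with a dot (.//) makes them relative to the current article. A minimal sketch, with the class selectors from the question shortened for readability:
# Sketch of the fix: ".//" restricts each lookup to the current <article>
# node instead of the whole document. Class selectors are shortened here.
for offer in response.xpath("//article"):
    product = Product()
    product['kind'] = 'deal'
    product['portal'] = 'mydealz'
    product['link'] = offer.xpath(".//a[contains(@class, 'cept-dealBtn')]/@href").get()
    product['offer'] = offer.xpath(".//strong[@class='thread-title']/a/text()").get()
    product['crawlingdate'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    yield product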
I have the following code. I know the queryset contains 2 items. However, Django only creates one entry. What am I overlooking?
I know "OrderItem.objects.filter(order=order)" returns two objects, but the order_lines.iterator() loop only creates one entry. Am I overlooking something?
order_lines = OrderItem.objects.filter(order=order)
inv_ref = (str(order.order_date.year) + "-" + str(10000 + order.id))
inv = Invoice.objects.create(invoice_ref=inv_ref, date=order.order_date, order=order, ship_to=order.order_ship_to, bill_to=order.order_bill_to)
value = 0
vat = 0
total = 0
commission = 0
## Looping not working properly: fix all items to be added to invoice.
for item in order_lines.iterator():
    exvat = (item.price / ((100 + item.vat) / 100))
    val = item.quantity * exvat
    ttl = (item.price * item.quantity)
    comm = item.commission
    vat_val = ttl - val
    InvoiceItems.objects.create(invoice=inv, order_item=item, quantity=item.quantity, price=exvat, value=val, vat=item.vat, vat_val=vat_val, total=ttl)
    value = value + val
    vat = vat + vat_val
    total = total + ttl
    commission = commission + comm
    print(item)
Do not use .iterator(): django.db.models.Model.objects.filter() already returns a QuerySet, which implements the iterator protocol (it is an Iterable).
...
for item in order_lines:
...
...
Try the simplified loop:
for item in order_lines:
It should work like a charm because order_lines is already a QuerySet object.
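For completeness, a minimal sketch of the corrected loop, iterating directly over the QuerySet and reusing the models and fields from the question (inv and order are assumed to be defined as in the question):
# Sketch only: same calculation as the question, iterating the QuerySet directly.
order_lines = OrderItem.objects.filter(order=order)

value = vat = total = commission = 0
for item in order_lines:  # one iteration per OrderItem in the queryset
    exvat = item.price / ((100 + item.vat) / 100)   # price excluding VAT
    val = item.quantity * exvat
    ttl = item.price * item.quantity
    vat_val = ttl - val
    InvoiceItems.objects.create(
        invoice=inv, order_item=item, quantity=item.quantity,
        price=exvat, value=val, vat=item.vat, vat_val=vat_val, total=ttl,
    )
    value += val
    vat += vat_val
    total += ttl
    commission += item.commission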
I am learning python-docx and I am using this solution to add page numbers, as given on Stack Overflow by @Utkarsh Dalal:
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

def create_element(name):
    return OxmlElement(name)

def create_attribute(element, name, value):
    element.set(qn(name), value)

def add_page_number(paragraph):
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    page_run = paragraph.add_run()
    t1 = create_element('w:t')
    create_attribute(t1, 'xml:space', 'preserve')
    t1.text = 'Page '
    page_run._r.append(t1)

    page_num_run = paragraph.add_run()
    fldChar1 = create_element('w:fldChar')
    create_attribute(fldChar1, 'w:fldCharType', 'begin')
    instrText = create_element('w:instrText')
    create_attribute(instrText, 'xml:space', 'preserve')
    instrText.text = "PAGE"
    fldChar2 = create_element('w:fldChar')
    create_attribute(fldChar2, 'w:fldCharType', 'end')
    page_num_run._r.append(fldChar1)
    page_num_run._r.append(instrText)
    page_num_run._r.append(fldChar2)

    of_run = paragraph.add_run()
    t2 = create_element('w:t')
    create_attribute(t2, 'xml:space', 'preserve')
    t2.text = ' of '
    of_run._r.append(t2)

    fldChar3 = create_element('w:fldChar')
    create_attribute(fldChar3, 'w:fldCharType', 'begin')
    instrText2 = create_element('w:instrText')
    create_attribute(instrText2, 'xml:space', 'preserve')
    instrText2.text = "NUMPAGES"
    fldChar4 = create_element('w:fldChar')
    create_attribute(fldChar4, 'w:fldCharType', 'end')
    num_pages_run = paragraph.add_run()
    num_pages_run._r.append(fldChar3)
    num_pages_run._r.append(instrText2)
    num_pages_run._r.append(fldChar4)

    for pg in num_pages_run.element:
        print(pg.text)

doc = Document()
add_page_number(doc.sections[0].footer.paragraphs[0])
doc.save("your_doc.docx")
I get the page number in the format "Page x of y".
But when I try to access the value of 'y' (the total number of pages), I can't do it. I have tried to read it via the text attribute of num_pages_run as pg.text, but all I get is NUMPAGES as output instead of the number of pages.
I am looking for this feature because I would like to do some actions whenever a new page is added to the document.
Is there a way to get total pages from python-docx or any other alternative?
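A note not from the original thread, since the question asks about alternatives: python-docx does not lay out or paginate the document, so the PAGE/NUMPAGES fields are only computed by the application that renders the file (Word, LibreOffice, etc.). On Windows with Microsoft Word installed, one hedged alternative is to let Word compute the page count via COM; the path below is a placeholder:
# Sketch only (assumes Windows + Microsoft Word + pywin32 installed).
import win32com.client

word = win32com.client.Dispatch("Word.Application")
doc = word.Documents.Open(r"C:\full\path\to\your_doc.docx")  # placeholder path
page_count = doc.ComputeStatistics(2)  # 2 == wdStatisticPages
print(page_count)
doc.Close(False)
word.Quit()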
I have this code and I want to iterate through all the divs inside a div. This is the first time I'm writing Python and the first time I'm writing a bot, so I'm using the first library I came across, Selenium.
all_urls_div = browser.find_elements_by_class_name("DPiy6, qF0y9, Igw0E, IwRSH, eGOV_, _4EzTm")
j = 0
for i in range(len(all_urls_div)):
    chat = all_urls_div.find_elements_by_class_name("DPiy6, qF0y9, Igw0E, IwRSH, eGOV_, _4EzTm")
    chat[j].click()
    lastMessageBlock = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div")
    j = j + 1
    act = ActionChains(browser)
    act.move_to_element(lastMessageBlock).perform()
    threeDots = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/button[1]").click()
    deleteLastMessage = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/div[2]/div/div[2]/div[4]/button").click()
    browser.find_element_by_xpath("/html/body/div[6]/div/div/div/div[2]/button[1]").click()
    textArea = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[2]/textarea")
    textArea.clear()
    browser.execute_script("arguments[0].value='" + message + "'", textArea)
    textArea.send_keys(Keys.SPACE)
    sendButton = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[3]/button").click()
    browser.execute_script("arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;",
        general_path_block)
I tried to write a loop, but after the first iteration an exception pops up. What should I do?
File "D:\PROJECTS\InstaBotByHushchadi\main.py", line 356, in resend_message_in_direct
chat = all_urls_div.find_elements_by_class_name("DPiy6, qF0y9, Igw0E, IwRSH, eGOV_, _4EzTm")
AttributeError: 'list' object has no attribute 'find_elements_by_class_name'
This exception
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
is because you have performed
chat.click()
for the first element, which may have redirected you to some other page/link, causing the other web elements to become stale.
Fix:
Redefine the list again on each iteration:
all_urls_div = general_path_chats_grid.find_elements_by_tag_name("div")
j = 0
for i in range(len(all_urls_div)):
    chat = general_path_chats_grid.find_elements_by_tag_name("div")
    chat[j].click()
    lastMessageBlock = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div")
    j = j + 1
    act = ActionChains(browser)
    act.move_to_element(lastMessageBlock).perform()
    threeDots = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/button[1]").click()
    deleteLastMessage = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div/div/div[2]/div/div[2]/div[4]/button").click()
    browser.find_element_by_xpath("/html/body/div[6]/div/div/div/div[2]/button[1]").click()
    textArea = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[2]/textarea")
    textArea.clear()
    browser.execute_script("arguments[0].value='" + message + "'", textArea)
    textArea.send_keys(Keys.SPACE)
    sendButton = browser.find_element_by_xpath(
        "/html/body/div[1]/section/div/div[2]/div/div/div[2]/div[2]/div/div[2]/div/div/div[3]/button").click()
    browser.execute_script("arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;",
        general_path_block)
order = 2
selected = 0
while selected < 21:  # because I can select at most 20 rows at once
    current_tr = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/div/div[3]/table/tbody/tr[%d]' % order)  # from row 1, below the table's header
    if current_tr.get_attribute("bgcolor") is None:  # no bgcolor means not yet reviewed
        driver.find_element_by_xpath("//td[2]/div/a").click()  # check the onclick content
        div_content = driver.find_element_by_xpath("//td[2]/div/div").text  # fetch onclick content
        driver.find_element_by_xpath("//td[2]/div/div/a").click()  # close the onclick content
        print(div_content)
        if "car" in div_content:  # judge if a certain string exists in the onclick content
            list_content = div_content.split("【car】")
            car_close = list_content[1].strip()  # fetch the content
            list_car = car_close.split(" ")
            car = list_car[0]
            print(car)
            orderminus = order - 1
            driver.find_element_by_xpath('//*[@id="%d"]/td[6]/a' % orderminus).click()  # pick this row
            time.sleep(1)
            selected = selected + 1
            order = order + 0  # if this row is picked, the row will disappear, so the order won't change
        else:  ### problem is here: the else branch seems to never be executed? Otherwise the if always holds? No, not possible; there are entries without "car". The problem occurs at the first instance of div_content without "car".
            order = order + 1  # if "car" is not in the content
            time.sleep(1)
    else:  # if already reviewed, order + 1
        order = order + 1
Above is my code, using Selenium to navigate a webpage containing a table.
First judgement: has the current row been reviewed?
Not yet reviewed? OK, print the info.
Already reviewed? Skip it.
Then a further judgement: is the string "car" in the info?
No? Skip it.
Yes? Click it, and the row disappears.
But currently, when I run this, what actually happens is:
during the second judgement, when the string "car" is not in the info,
it keeps printing the info. It seems it never takes the else branch and keeps running lines 6-9 of this snippet forever, a dead-end loop.
Why? Can anybody give me a clue?
To make things clearer, I have simplified my code as below:
list = []
list.append("ff122")
list.append("carff")
list.append("ff3232")
list.append("ffcar")
list.append("3232")
order = 0
selected = 0
while selected < 6:
    current_tr = list[order]
    print("round %d %s" % (order, current_tr))
    if "ff" in current_tr:
        print("ff is in current_tr")
        if "car" in current_tr:
            print("car")
            selected = selected + 1
            order = order + 0
        else:
            order = order + 1
            print("order is %d" % order)
    else:  # if already reviewed, order + 1
        order = order + 1
        print("order is %d" % order)
Anybody can run this. What I need to do is first filter on "ff"; if "ff" exists, then filter on "car". When both conditions are TRUE, selected + 1, until selected reaches a certain number. In the real case, don't worry, the list is long enough.
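One observation on the real Selenium code, not from the original post: driver.find_element_by_xpath("//td[2]/div/a") searches the whole page from the document root, so it always hits the first matching row no matter which current_tr the loop is on, which would explain why the same info keeps being printed. A minimal sketch of looking the cells up relative to the current row instead:
# Sketch only: search within current_tr instead of the whole document.
current_tr.find_element_by_xpath(".//td[2]/div/a").click()       # open the onclick content of THIS row
div_content = current_tr.find_element_by_xpath(".//td[2]/div/div").text
current_tr.find_element_by_xpath(".//td[2]/div/div/a").click()   # close it again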
I need to insert all HTML tags and attributes into a database.
el.driver.get(url_page)
txthtml = el.driver.page_source
soup = BeautifulSoup(txthtml, "html.parser")
body = soup.find('html')
html_parse(body, el, url_page_id, 0, 0, 0,url_page)
def html_parse(html, el, url_page_id, level, i, parent_id, url_page):
    txt = ""
    if len(html.text) > 0:
        txt = html.text.replace("\n", "").replace("\t", "").replace("\r", "")
    ta = tag_list()
    ta.p_id = el.id
    ta.page_id = url_page_id
    ta.level = level
    ta.number = i
    ta.txt = txt
    ta.name = html.name
    ta.parent_id = parent_id
    ta.html = str(html)
    ta.save()
    insert_attr(html, el.id, url_page_id, ta.id, url_page)
    children = list(html.children)
    j = 0
    for child in children:
        if child.name is None:
            continue
        j = j + 1
        html_parse(child, el, url_page_id, level + 1, j, ta.id, url_page)
I have a recursive function html_parse, where:
html - current html object
el - driver class
url_page_id - id of page
level - level in DOM
i - child number
parent_id - id of parent
url_page - current URL
tag_list - insert current tag
insert_attr - insert into database attrs of tag
Each html_parse call runs fast, but parsing one big HTML page takes about 4-5 minutes in total.
How can I speed up the code?
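Not part of the original post, but if tag_list and insert_attr are Django models (the .save() and .id usage suggests they are), one common speed-up is to run the whole recursive parse inside a single transaction, so the many per-tag INSERTs are committed once at the end instead of once per row under autocommit. A minimal sketch:
# Sketch, assuming Django's default autocommit mode: one commit for the
# whole page instead of one commit per saved tag/attribute row.
from django.db import transaction

with transaction.atomic():
    html_parse(body, el, url_page_id, 0, 0, 0, url_page)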