Unable to use different values in multiple lines while using yield - python

I'm trying to figure out how I can yield multiple values split across several lines. To be clear: not multiple yield statements; rather, a single yield spanning multiple lines. In the case of return I can use something like this:
return (placeholder_one,placeholder_two,placeholder_three +
        placeholder_four,placeholder_five,placeholder_six,title,link)
However, I get stuck when it comes to doing the same with yield.
My goal is to write the values to a CSV file. If I use return, I can write them in the following manner:
placeholder_one,placeholder_two,placeholder_three,placeholder_four,placeholder_five,placeholder_six,title,link = fetch_items()
writer.writerow([placeholder_one,placeholder_two,placeholder_three,placeholder_four,placeholder_five,placeholder_six,title,link])
If I use yield, I can simply use this within the __main__ block (which would be ideal):
if __name__ == '__main__':
    for item in fetch_items():
        writer.writerow(item)
        print(item)
I've used some placeholders just to make the line longer; here is the full script:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = "https://stackoverflow.com"
url = "https://stackoverflow.com/questions/tagged/web-scraping"
def fetch_items():
    res = requests.get(url,headers={"User-Agent":"Mozilla/5.0"})
    soup = BeautifulSoup(res.text,"html.parser")
    placeholder_one = "Some name"
    placeholder_two = "Some id"
    placeholder_three = "Gender info"
    placeholder_four = "Some phone"
    placeholder_five = "Some email"
    placeholder_six = "Some credit info"
    for items in soup.select(".summary"):
        title = items.select_one(".question-hyperlink").get_text(strip=True)
        link = urljoin(base,items.select_one(".question-hyperlink").get("href"))
        yield placeholder_one,placeholder_two,placeholder_three,placeholder_four,placeholder_five,placeholder_six,title,link

if __name__ == '__main__':
    for item in fetch_items():
        print(item)
How can I spread the yielded values across two or three lines, the way I did with return?

Are you perhaps just expecting something close to:
for items in soup.select(".summary"):
    title = items.select_one(".question-hyperlink").get_text(strip=True)
    link = urljoin(base,items.select_one(".question-hyperlink").get("href"))
    yield (placeholder_one,placeholder_two,placeholder_three,
           placeholder_four,placeholder_five,placeholder_six,
           title,link)

Try creating a list or a tuple to yield:
yield [placeholder_one,placeholder_two,placeholder_three,placeholder_four,placeholder_five,placeholder_six,title,link]
As an example consider:
def num():
    for i in range(10):
        yield [2 + i, 6 + i, 7 + i]

for i in num():
    print(i[0])
    print(i[1])
    print(i[2])
Also take a look at yield-multiple-values
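For the CSV goal mentioned in the question, the yielded tuples can be passed straight to csv.writer.writerow(). A minimal sketch, assuming fetch_items() is the generator defined in the question; "output.csv" is just an illustrative filename:
import csv

if __name__ == '__main__':
    # "output.csv" is a hypothetical filename; fetch_items() is the generator from the question.
    with open("output.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for item in fetch_items():
            writer.writerow(item)   # each yielded tuple becomes one CSV row
            print(item)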

Related

How to get all emails from a page individually

I am trying to get all the emails from a specific page and separate them into individual variables, or even better a dictionary. This is my code:
import requests
import re
import json
from bs4 import BeautifulSoup
page = "http://www.example.net"
info = requests.get(page)
if info.status_code == 200:
    print("Page accessed")
else:
    print("Error accessing page")
code = info.content
soup = BeautifulSoup(code, 'lxml')
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
print(allEmails)
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmails))
print(type(allEmailsStr))
j = allEmailsStr.split(sep, 1)[0]
print(j)
Excuse the poor variable names; I put this together quickly so that it would run on its own. For example, the output from the example website would be something like
[k, kolyma, location, balkans]
So if I ran the program it would return only
[k
But if I wanted it to return every email on there individually how would I do that?
To get just the email strings you can try:
emails = []
for email_link in allEmails:
    emails.append(email_link.get("href").replace('mailto:', ''))
print(emails)
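If you prefer the dictionary mentioned in the question, one option (a sketch, assuming allEmails is the list of mailto anchors found above) is to map each link's visible text to its address:
emails = {}
for email_link in allEmails:
    # key: the anchor's visible text, value: the address without the "mailto:" prefix
    emails[email_link.get_text(strip=True)] = email_link.get("href").replace("mailto:", "")
print(emails)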
Based on your expected output, you can use the unwrap function of BeautifulSoup
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
    print(Email.unwrap())  # This will print the whole element along with the tag
    # k

Can't return a string and a list from one function to another

I've created a script in Python to parse the website address of different agencies from the landing page and the location address from each inner page. What I can't understand is how I can return a string and a list at the same time so they can be reused in another function. To be clearer: I wish to return the website address and the list of links from the collect_links() function and reuse them in the get_info() function. My current approach throws an error: ValueError: not enough values to unpack (expected 2, got 1).
This is my attempt so far:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    website = [soup.select_one("p.company-profile-website > a").get("href")]
    items = [urljoin(url,item.get("href")) for item in soup.select("[id^='company-'] .search-companies-result-info h2 > a")]
    return website,items

def get_info(website,link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    address = soup.select_one("p.footer-right").get_text(strip=True)
    print(website,address)
if __name__ == '__main__':
    url = "https://www.cv-library.co.uk/companies/agencies/A"
    for item,link in collect_links(url):
        get_info(item,link)
How can I return a string and a list from one function to another?
PS: I would like to stick to the design I've already tried.
Your website is a list containing a single string, not a string, because you've wrapped it in a [] literal. You need to drop the []; there's no point making it a list.
After doing that, you can take the return value and iterate over the links like this:
if __name__ == '__main__':
    url = "https://www.cv-library.co.uk/companies/agencies/A"
    website, links = collect_links(url)
    for link in links:
        get_info(website, link)
The main error in the code is in this line:
website = [soup.select_one("p.company-profile-website > a").get("href")]
This only returns one value:
http://www.autoskills-uk.com
Your function should be:
def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    websites = [x.get("href") for x in soup.select("p.company-profile-website > a")] #<============== Changed
    items = [urljoin(url,item.get("href")) for item in soup.select("[id^='company-'] .search-companies-result-info h2 > a")]
    return zip(websites, items)
Return a zip of websites and items.
Now you can unpack item and link in the for loop:
if __name__ == '__main__':
    url = "https://www.cv-library.co.uk/companies/agencies/A"
    for item,link in collect_links(url):
        get_info(item,link)
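For clarity, zip simply pairs the two lists element by element, so each yielded pair unpacks into one website and one link. A tiny sketch with made-up values:
websites = ["http://a.com", "http://b.com"]
items = ["/link/1", "/link/2"]
for website, link in zip(websites, items):
    print(website, link)   # prints: http://a.com /link/1, then http://b.com /link/2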
You are returning two lists as a tuple, one with a single element and another with many elements, and then trying to iterate over that tuple, unpacking each list into the two names item and link.
I don't know exactly what you want to do, but you should separate the for loop from the return values:
website, links = collect_links(url)
for link in links:
    get_info(website[0], link)
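Stripped of the scraping details, the underlying pattern is returning a tuple and unpacking it once, outside the loop. A minimal self-contained sketch with made-up values:
def collect():
    website = "http://www.example.com"            # a single string
    links = ["/page/1", "/page/2", "/page/3"]     # a list of links
    return website, links                         # returned together as a tuple

website, links = collect()    # unpack the tuple once
for link in links:            # then iterate over the list only
    print(website, link)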

Python - print 2nd argument

I am new to Python and I've written this test-code for practicing purposes, in order to find and print email addresses from various web pages:
def FindEmails(*urls):
    for i in urls:
        totalemails = []
        req = urllib2.Request(i)
        aResp = urllib2.urlopen(req)
        webpage = aResp.read()
        patt1 = '(\w+[-\w]\w+@\w+[.]\w+[.\w+]\w+)'
        patt2 = '(\w+[\w]\w+@\w+[.]\w+)'
        regexlist = [patt1,patt2]
        for regex in regexlist:
            match = re.search(regex,webpage)
            if match:
                totalemails.append(match.group())
                break
    #return totalemails
    print "Mails from webpages are: %s " % totalemails

if __name__== "__main__":
    FindEmails('https://www.urltest1.com', 'https://www.urltest2.com')
When I run it, it prints only one argument.
My goal is to print the emails acquired from webpages and store them in a list, separated by commas.
Thanks in advance.
The problem here is the line totalemails = []. You are re-instantiating the variable totalemails as an empty list on every iteration, so it only ever holds one entry, and after the last iteration you'll end up with just the last entry. To get a list of all the emails, you need to move the variable outside of the for loop.
Example:
def FindEmails(*urls):
    totalemails = []
    for i in urls:
        req = urllib2.Request(i)
        ....
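The same point, independent of urllib2 and the regex details: create the list once before the loop and extend it on every iteration. A small sketch where find_emails() is a hypothetical stand-in for the request-and-regex logic in the question:
def find_emails(url):
    # hypothetical stand-in for the urllib2 request + re.search logic
    return ["contact@" + url.split("//")[-1]]

def FindEmails(*urls):
    totalemails = []                           # created once, before the loop
    for url in urls:
        totalemails.extend(find_emails(url))   # accumulate instead of overwriting
    print("Mails from webpages are: %s" % totalemails)

FindEmails('https://www.urltest1.com', 'https://www.urltest2.com')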

How to increment variable in the middle of URL and output for multiple queries?

I would like to modify the code below to allow for searching multiple stores at once (via the four digit store number in the 'data' section below). What is the best way to accomplish this? Preferably I would be able to limit the search to 50-100 stores.
import requests
import json
js = requests.post("http://www.walmart.com/store/ajax/search",
data={"searchQuery":"store=2516&size=18&dept=4044&query=43888060"} ).json()
data = json.loads(js['searchResults'])
res = data["results"][0]
print(res["name"], res["inventory"])
I would also like the store # printed in the line above.
Your data object in the requests.post call can be constructed like any other string, and then a variable that represents it can take the place of your "store=2516..." string. Like this, assuming requests is defined in the outer scope someplace and can be reused:
var stores = ["2516","3498","5478"];
stores.forEach( makeTheCall );

function makeTheCall( element, index, array ) {
    storeQuery = "store=" + element + "&size=18&dept=4044&query=43888060";
    js = requests.post("http://www.walmart.com/store/ajax/search",
                       data={"searchQuery":storeQuery} ).json()
    data = json.loads(js['searchResults'])
    res = data["results"][0]
    console.log("name: " + res["name"] + ", store: " + element + ", inventory: " + res["inventory"]);
}
I'm not familiar with your use of "print"; I've only ever used client-side JavaScript.
The API does not support searching for multiple stores, so you need to make multiple requests.
import requests
import json
from collections import defaultdict
results = defaultdict(list)
stores = [2516, 1234, 5678]
url = "http://www.walmart.com/store/ajax/search"
query = "store={}&size=18&dept=4044&query=43888060"
for store in stores:
    r = requests.post(url, data={'searchQuery': query.format(store)})
    r.raise_for_status()
    try:
        data = json.loads(r.json()['searchResults'])['results'][0]
        results[store].append((data['name'],data['inventory']))
    except IndexError:
        continue

for store, data in results.iteritems():
    print('Store: {}'.format(store))
    if data:
        for name, inventory in data:
            print('\t{} - {}'.format(name, inventory))
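Note that dict.iteritems() only exists in Python 2; under Python 3 the reporting loop would be written with .items() instead (a sketch, assuming the same results defaultdict as above):
for store, data in results.items():   # Python 3: .items() replaces .iteritems()
    print('Store: {}'.format(store))
    for name, inventory in data:
        print('\t{} - {}'.format(name, inventory))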

How to scrape more than first instance of triple-nested list of links in Python?

I am trying to determine the simplest way to record the contents of webpages linked from webpages linked from an original webpage. I would like my output to be a table with rows corresponding to the contents of the third layer deep of pages.
As you can see from the code, I am currently only able to get the first instance of a desired item on the third-level page. Also, while my current code will return one row corresponding to each h2 item on the base URL, I hope to have multiple rows per h2 item (as many as there are instances of "span.'case-doc-details' a" on the second layer).
Some additional info: at each linking stage, I do not know how many pages will be linked. I am using Python and ScraperWiki, and I am new to both. I have attempted to research the question, but have hit a roadblock in my knowledge of what to ask. Thanks in advance for any help.
import scraperwiki
import urlparse
import lxml.html
import urllib
def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter=0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Count']=counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title=caseroots.cssselect("title")
            record['Title'] = title[0].text_content()
            ids=caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids)<=2:
                    record['Rules']="None"
                    record['Treaty']="None"
                else:
                    record['Rules']=ids[2].text_content()
                    record['Treaty']=ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[0].attrib.get('href')).read()
            caseroots2=lxml.html.fromstring(caselinkurl2)
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            for i in pars:
                if len(pars)==0:
                    record['DetailsURL']="None"
                else:
                    record['DetailsURL']=pars[0].attrib.get('href')
                pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2)==0:
                    record['Doc Date']="None"
                else:
                    record['Doc Date']=pars2[0].text_content()
                pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) ==0:
                    record['Doc Type Link']="None"
                    record['Doc Type']="None"
                else:
                    record['Doc Type Link']=pars3[0].attrib.get('href')
                    record['Doc Type']=pars3[0].text_content()
                pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4)==0:
                    record['Claimant Nominee']="None"
                else:
                    record['Claimant Nominee']=pars4[0].text_content()
                pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5)==0:
                    record['Respondent Nominee']="None"
                else:
                    record['Respondent Nominee']=pars5[0].text_content()
                pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6)==0:
                    record['President']="None"
                else:
                    record['President']=pars6[0].text_content()
            print record, '------------'
            scraperwiki.sqlite.save(['Count'],record)
            counter+=1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Here's the code I've got so far - this doesn't yet grab the documents link data (or save anything), but that should be a case of extending the principles here into another function:
import scraperwiki
import urlparse
import lxml.html
import urllib
def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "NO URL"
        record['arbrule'] = "NO ARBRULE"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "NO LEGAL BASIS GIVEN"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks'] = extralinks
    else:
        record['extralinks'] = "NO EXTRA LINKS"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" Case No.")[1]
    print record

def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com'+linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Here is what I got to work for this problem.
A few instructive general points:
Use an if/else block to distinguish the case where your key attribute has zero length from the case where it has non-zero length.
Just before this, create your dictionary.
In both the if and else branches of the block, give the printing, storing and index-incrementing instructions. Set your index to zero just before entering the block.
In the else branch, create a for loop that iterates over each instance i, with the key attribute you want to iterate over set to record the ith instance. Set all other attributes to the zeroth instance.
Finally, when dealing with an arbitrary number of triple-nested links, it will generally be best to scrape all data (if possible) from the lowest level you are scraping. In my case, this worked, because all of the attributes I wanted to record were repeated on this level. In other cases, I am not sure what the best way to proceed would be.
Thanks to Paul for nudging this forward.
import scraperwiki
import urlparse
import lxml.html
import urllib
def scrape_table(root):
    rows = root.cssselect("h2")
    counter=0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            table_cellsurls = table_cells[0].cssselect("a")
            #record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            record = {}
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            if len(pars)==0:
                record['DetailsURL']="None"
                record['Count']=counter
                print record, '------------'
                scraperwiki.sqlite.save(['Count'],record)
                counter+=1
            else:
                for i in range(0,len(pars)):
                    record['Count']=counter
                    caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[i].attrib.get('href')).read()
                    caseroots2=lxml.html.fromstring(caselinkurl2)
                    record['DetailsURL']=pars[i].attrib.get('href')
                    title=caseroots2.cssselect("h2")
                    record['Title'] = title[1].text_content()
                    rules=caseroots2.cssselect("div.'field-name-field-arbitration-rules'")
                    if len(rules)==0:
                        record['Rules']="None"
                    else:
                        record['Rules']=rules[0].text_content()
                    treaty=caseroots2.cssselect("div.'field-name-field-case-treaties'")
                    if len(treaty)==0:
                        record['Treaty']="None"
                    else:
                        record['Treaty']=treaty[0].text_content()
                    pars2=caseroots2.cssselect("div.'field-name-field-case-document-date'")
                    if len(pars2)==0:
                        record['Doc Date']="None"
                    else:
                        record['Doc Date']=pars2[0].text_content()
                    pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                    if len(pars3) ==0:
                        record['Doc Type Link']="None"
                        record['Doc Type']="None"
                    else:
                        record['Doc Type Link']=pars3[0].attrib.get('href')
                        record['Doc Type']=pars3[0].text_content()
                    pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    if len(pars4)==0:
                        record['Claimant Nominee']="None"
                    else:
                        record['Claimant Nominee']=pars4[0].text_content()
                    pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    if len(pars5)==0:
                        record['Respondent Nominee']="None"
                    else:
                        record['Respondent Nominee']=pars5[0].text_content()
                    pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    if len(pars6)==0:
                        record['President']="None"
                    else:
                        record['President']=pars6[0].text_content()
                    print record, '------------'
                    scraperwiki.sqlite.save(['Count'],record)
                    counter+=1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
