How to apply scraping function to an entire column in a dataframe - python

This will be a long post, as the problem I'm facing is embedded in a larger project -- thanks to anyone who takes the time to read this.
Basically, I'm scraping the Wikipedia:Featured articles page. There are hundreds of article links on this page, and I have already succeeded in compiling a list of the articles on it that are biographies. The following code was used:
def __api_GET_latest_page(title):
    parameters = {
        "action": "parse",
        "page": title,
        "format": "json"
    }
    response_json = __get("revisions", title, parameters)
    if ("parse" in response_json.keys()
            and "text" in response_json["parse"].keys()
            and "*" in response_json["parse"]["text"].keys()):
        return response_json["parse"]["text"]["*"]
    return None
def __get(function_key, key, parameters, check_cache=True, write_cache=True):
    target = "https://en.wikipedia.org/w/api.php"
    cache_path = "cached_api"
    params_unicode = str(parameters).encode('utf-8')
    md5 = hashlib.md5(params_unicode).hexdigest()
    return_json = None
    cache_file = os.path.join(cache_path, function_key, str(key), md5)
    cache_exists = os.path.isfile(cache_file)
    if cache_exists:
        try:
            json_in = open(cache_file, "r")
            json_str = json_in.read()
            return_json = json.loads(json_str)
            if "error" in return_json.keys() and "code" in return_json["error"].keys() and return_json["error"]["code"] == "maxlag":
                cache_exists = False
        except:
            cache_exists = False
    if not cache_exists:
        cache_dir = os.path.dirname(cache_file)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        r = requests.get(target, params=parameters)
        request_json = r.json()
        json_out = open(cache_file, "w")
        print(json.dumps(request_json), file=json_out)
        return_json = request_json
    return return_json
def __remove_tables_and_scripts(tree):
    tags_to_remove = ["tbody", "td", "script"]
    for tag in tags_to_remove:
        # findall() returns every matching element; find() would only return the first match
        elements = tree.findall(f".//{tag}")
        if elements:
            for e in elements:
                e.getparent().remove(e)
    return tree
def page_text(name, format, include_tables=False):
    result = None  # initialized so the check below cannot raise NameError if the request fails
    try:
        result = __api_GET_latest_page(name)
    except:
        print("API request failed.")
    if result:
        e = etree.fromstring(result)
        if not include_tables:
            e = __remove_tables_and_scripts(e)
        if format == "html":
            return str(etree.tostring(e))
        elif format == "text":
            return ''.join(e.itertext())
        elif format == "list":
            return ''.join(e.itertext()).split('\n')
    else:
        print("Failed to retrieve a page.")
        return None
The above code, specifically the page_text() function, gets the plain text of any Wikipedia page and caches the result locally. Anyway, with the following code, I got a list of all the article titles on the Wikipedia featured articles page that are biographies:
def get_featured_biographies(t):
    titles = page_text("Wikipedia:Featured articles", "list")
    titles = titles[40:]
    titles = titles[:-7]
    titles = list(filter(lambda x: x != '', titles))
    list_featured_biographies = []
    boolean = False
    for elem in t:
        if ('[edit]' in elem) and ('biographies' in elem) | ('Biographies' in elem):
            boolean = True
            continue
        elif ('[edit]' in elem) and ('biographies' not in elem):
            boolean = False
        if boolean:
            list_featured_biographies = list_featured_biographies + [elem]
        else:
            continue
    return list_featured_biographies

list_featured_biographies = get_featured_biographies(titles)
This is an example of the output:
Here's where I run into problems. I need to write a function that scrapes each of the individual pages for the featured biography titles in the list I created. Specifically, I need to extract the first paragraph of each biography. I have succeeded in this task with the following code:
for title in list_featured_biographies:
    page_content = page_text(title, "list")
    list_of_values_with_keywords = []
    for value in page_content:
        if ('was a' in value) | ('was an ' in value) | ('is a ' in value) | ('is an ' in value):
            list_of_values_with_keywords.append(value)
    first_paragraph = list_of_values_with_keywords[0]
    print(first_paragraph)
And so Bronwyn Bancroft, Felice Beato, and Jean Bellette are the first three names. The following screenshot shows the output for these first three names.
As you can see, my output is essentially a list of first paragraphs. I would like to organize this information into a two-column dataframe, with the first column being the article title and the second being the article's first paragraph. The following code encounters an error in trying to achieve this:
title2_list = []
list_of_first_para = []
for title in list_featured_biographies:
    page_content = page_text(title, "list")
    title2_list.append(title)
    list_of_values_with_keywords = []
    for value in page_content:
        if ('was a' in value) | ('was an ' in value) | ('is a ' in value) | ('is an ' in value):
            list_of_values_with_keywords.append(value)
    first_paragraph = list_of_values_with_keywords[0]
    list_of_first_para.append(first_paragraph)

data2_for_df = {'Article_Title': title2_list, 'First_Paragraph': list_of_first_para}
wiki1para_df = pd.DataFrame(data2_for_df)
print(wiki1para_df)
This is the error I run into:
IndexError                                Traceback (most recent call last)
<ipython-input-317-f36585876409> in <module>
     13     return first_paragraph
     14
     16 print(first_paragraph)

<ipython-input-317-f36585876409> in get_first_paragraph(list)
      9         list_of_values_with_keywords.append(value)
     10
---> 11     first_paragraph = list_of_values_with_keywords[0]
     12
     13     return first_paragraph

IndexError: list index out of range
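The IndexError means that for some page no line contains any of the keyword phrases, so list_of_values_with_keywords is empty and [0] fails. A minimal sketch of one way to guard against that while building the dataframe (it reuses page_text and list_featured_biographies from above; the decision to simply skip pages with no match is my own assumption, not part of the original code):

import pandas as pd

def first_keyword_paragraph(page_content):
    # return the first line containing one of the keyword phrases, or None if there is none
    keywords = ('was a', 'was an ', 'is a ', 'is an ')
    for value in page_content:
        if any(k in value for k in keywords):
            return value
    return None

title2_list = []
list_of_first_para = []
for title in list_featured_biographies:
    page_content = page_text(title, "list")
    first_paragraph = first_keyword_paragraph(page_content)
    if first_paragraph is None:
        continue  # skip pages without a keyword sentence instead of raising IndexError
    title2_list.append(title)
    list_of_first_para.append(first_paragraph)

wiki1para_df = pd.DataFrame({'Article_Title': title2_list,
                             'First_Paragraph': list_of_first_para})
print(wiki1para_df)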

Related

How to fix "KeyError at /input: 'Company'" when selecting a single row from a DataFrame?

def input(request):
    if 'pass' in request.POST:
        company = request.POST['pass']
    else:
        company = False
    df = pandas.read_csv('data.csv', index_col=None)
    take = df.groupby('Company').mean()
    table = take[take['Company'] == company]
    table_content = table.to_html(classes='table')
    return render(request, 'result.html', {'table_content': table_content})
I want to present a single row in HTML in tabular form, but I am getting a KeyError. I am working with Django. The error is:
KeyError at /input
'Company'
If you have a dictionary in Python and you want to check whether a key exists, you should use get. See https://www.programiz.com/python-programming/methods/dictionary/get
This is what you should do in your web app as well.
Example:
d = {'k': 'v'}
val = d.get('k')
if val is None:
    print('not found')
else:
    print('found')
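Applied to the view above, the same .get pattern might look like the sketch below. Note also that groupby('Company').mean() moves 'Company' into the result's index, which is a likely source of the KeyError, so as_index=False is used here to keep it as a regular column. This is only a sketch under those assumptions, not the original code:

import pandas
from django.shortcuts import render

def input(request):
    # .get() returns None instead of raising KeyError when the key is missing
    company = request.POST.get('pass')
    if company is None:
        company = False

    df = pandas.read_csv('data.csv', index_col=None)
    # as_index=False keeps 'Company' as a column after the groupby,
    # so take['Company'] does not raise KeyError
    take = df.groupby('Company', as_index=False).mean()
    table = take[take['Company'] == company]
    table_content = table.to_html(classes='table')
    return render(request, 'result.html', {'table_content': table_content})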

I'm getting connection errors in my Python program

I have tried to run my program, but each time I get an error in the middle of the run.
Basically, my program does this:
1. get the XML sitemap from my website
2. run through all the URLs
3. get data from my web page (sku, name, title, price etc.)
4. get the lowest price from another website, by comparing prices for the same sku
The problem is that I have more than 7,000 URLs in my XML, so my program hits a network error each time.
What can I do? How can I resolve it?
def parse_sitemap(url):
    resp = requests.get(XXXX)  # XXXX: sitemap URL, redacted in the question
    out = []
    # assumption: the <url> entries are parsed from the sitemap response
    urls = BeautifulSoup(resp.content, 'xml').find_all('url')
    for u in urls:
        loc = u.find('loc').string
        # not a sitemap requirement - skip if not present
        out.append([loc])
    return out
def get_sku(u):
    html = requests.get(u)
    bsObj = BeautifulSoup(html.content, 'xml')
    sku = bsObj.find('span', attrs={'itemprop': 'sku'}).get_text()
    return sku
def get_price(u):
    try:
        html = requests.get(u)
        bsObj = BeautifulSoup(html.content, 'xml')
        price = bsObj.find('span', attrs={'itemprop': 'price'}).get_text()
        price = str(price).replace(' ₪‎', '')
        return price
    except:
        return 'no price'
def get_zapPrice(makat):
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        r = requests.get(check)
        html = requests.get(r.url)
        bsObj = BeautifulSoup(html.content, 'html.parser')
        zapPrice = bsObj.select_one('div.StoresLines div.PriceNum').text.strip().replace(' ₪', '')
        return zapPrice
    except:
        return 'no zap product'
def get_zapStoreName(makat):
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        r = requests.get(check)
        html = requests.get(r.url)
        bsObj = BeautifulSoup(html.content, 'html.parser')
        storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt').text.strip().replace('ב-', '')
        return storeName
    except:
        return 'no zap product'
# enumerate supplies the counter i used in the progress print below
for i, u in enumerate(urls):
    ws1['A1'] = u
    makat = get_sku(u)
    ws1['F1'] = makat
    zapPrice = get_zapPrice(makat)
    ws1['I1'] = zapPrice
    storeName = get_zapStoreName(makat)
    ws1['J1'] = storeName
    ws1.insert_rows(1)
    ws1.append([])
    print("writing product no. " + str(i))

ws1['F1'] = 'makat'
ws1['I1'] = 'zap price'
ws1['J1'] = 'zap store'
wb.save("sample.xlsx")
wb.close()
print('end')
I didn't write all my code, but the basics are here: each function starts with requests.get, gets what I want, and returns it. After that, I write the result to an Excel file.
The problem is that I get the error after about 1,000 URL checks...
What is the problem?
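One common way to make long scraping runs survive transient connection errors is to reuse a single requests Session configured with automatic retries and back-off, and to pause briefly between requests. This is only a sketch of that general approach; the specific retry settings and the fetch helper are assumptions, not from the question:

import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# one Session reused for every request, with automatic retries on transient failures
session = requests.Session()
retries = Retry(total=5,                       # retry each request up to 5 times
                backoff_factor=1,              # wait 1s, 2s, 4s, ... between attempts
                status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('https://', adapter)
session.mount('http://', adapter)

def fetch(url):
    # returns the response, or None if the request still fails after all retries
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
        return resp
    except requests.exceptions.RequestException as e:
        print('request failed:', url, e)
        return None

# inside the main loop, a short pause keeps 7,000+ requests from hammering the servers
# time.sleep(0.5)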

Access list of values from a function in another function in Python?

I have a function that gathers a list of UPCs. I created another function that takes the list and searches for prices. The issue I am having is that when a UPC is not found, a KeyError occurs. How do I ignore the UPCs with no match and continue with the code? The current version of the code is an infinite loop.
def trending():
    trending = requests.get('http://api.com/trends?format=json&apiKey={}'.format(apiKey))
    trendingResponse = trending.json()
    items = trendingResponse['items']
    for item in items:
        price = item['salePrice']
        name = item['name']
        upc = item['upc']
        stock = item['stock']
        image = item['largeImage']
        url = item['productUrl']
        sDescription = item['shortDescription']
        brandName = item['brandName']
        availableOnline = item['availableOnline']
        print('Current UPC = ' + str(upc))
        return upc_lookup(upc)

def upc_lookup(upc):
    products_api = mws.Products(access_key, secret_key, seller_id, region='US')
    # lookup product by upc
    products = products_api.get_matching_product_for_id(marketplaceid=marketplace_usa, type_='UPC', ids=upc)
    parse = products.parsed
    while True:
        try:
            # return asin from UPC lookup
            asin = parse['Products']['Product']['Identifiers']['MarketplaceASIN']['ASIN']['value']
            print('ASIN Found = ' + str(asin))
        except KeyError:
            print('UPC {} not Found in Amazon'.format(upc))
Looks like I had to move the return in the first function out of the for loop.
def function_1():
    function_1.item = {'salePrice': 100, 'name': 'ABC', 'stock': 3, 'brandName': 4}

def function_2():
    item = function_1.item
    sp = item['salePrice']
    name = item['name']
    stock = item['stock']
    print(sp, name, stock)

function_1()
function_2()
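For the original question of skipping UPCs with no match, a minimal sketch of the usual pattern is to catch the KeyError inside the loop and continue, rather than looping forever. The MWS call mirrors the question's code; the wrapper function and its parameters are my own illustration:

def lookup_all(upcs, products_api, marketplace_id):
    # look up each UPC, skipping those with no Amazon match instead of raising KeyError
    asins = {}
    for upc in upcs:
        products = products_api.get_matching_product_for_id(
            marketplaceid=marketplace_id, type_='UPC', ids=upc)
        parse = products.parsed
        try:
            asin = parse['Products']['Product']['Identifiers']['MarketplaceASIN']['ASIN']['value']
            asins[upc] = asin
            print('ASIN Found = ' + str(asin))
        except KeyError:
            # no match for this UPC - skip it and continue with the next one
            print('UPC {} not found in Amazon, skipping'.format(upc))
            continue
    return asins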

Python saves only one row of data

def get_user_data(self, start_url):
    html = self.session.get(url=start_url, headers=self.headers, cookies=self.cookies).content
    selector = etree.fromstring(html, etree.HTMLParser(encoding='utf-8'))
    all_user = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]')
    for i in all_user:
        user_id = i.xpath('./div[1]/a[@class="nk"]/@href')[0]
        content = i.xpath('./div[1]/span[1]')[0]
        contents = content.xpath('string(.)')
        times = i.xpath('./div/span[@class="ct"]/text()')[0]
        if len(i.xpath('./div[3]')):
            imgages = i.xpath('./div[2]/a/img/@src')
            praise_num = i.xpath('./div[3]/a[2]/text()')
            transmit_num = i.xpath('./div[3]/a[3]/text()')
        elif len(i.xpath('./div[2]')):
            imgages = i.xpath('./div[2]/a/img/@src')
            praise_num = i.xpath('./div[2]/a[3]/text()')
            transmit_num = i.xpath('./div[2]/a[4]/text()')
        else:
            imgages = ''
            praise_num = i.xpath('./div[1]/a[2]/text()')
            transmit_num = i.xpath('./div[1]/a[3]/text()')
        try:
            if re.search('from', times.encode().decode('utf-8')):
                month_day, time, device = times.split(maxsplit=2)
                self.data['mobile_phone'] = device
            else:
                time, device = times.split(maxsplit=1)
                self.data['month_day'] = ''
            self.data['create_time'] = month_day + ' ' + time
        except Exception as e:
            print('failure:', e)
        self.data['crawl_time'] = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        self.data['user_id'] = user_id
        self.data['contents'] = contents.encode().decode('utf-8').replace('\u200b', '')
        self.data['imgages'] = imgages
        self.data['praise_num'] = praise_num
        self.data['transmit_num'] = transmit_num
    with open('a.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(self.data) + '\n')
I am trying to grab the data from every page and save it, but I wrote it wrong: only one piece of data per page ends up in 'a.txt'. How do I write the code so that every row of every page is saved correctly in 'a.txt'?
The write operation is outside the for loop, which is why only the last iteration's data is added to the file:
with open('a.txt', 'a', encoding='utf-8') as f:
    f.write(json.dumps(self.data) + '\n')
You're overwriting the various values in self.data in every iteration of the loop.
Instead, self.data should be a list. You should create a new dictionary in each iteration and append it to the data at the end.
self.data = []
for i in all_user:
    values = {}
    ...
    values['crawl_time'] = ...
    values['user_id'] = ...
    ...
    self.data.append(values)
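Putting the two answers together, a minimal sketch of the intended pattern, using placeholder data rather than the question's scraping code, might look like this:

import json

def save_user_rows(rows, path='a.txt'):
    # append every row (one dict per user) as its own JSON line
    with open(path, 'a', encoding='utf-8') as f:
        for values in rows:
            f.write(json.dumps(values, ensure_ascii=False) + '\n')

# build a fresh dict per user inside the scraping loop, collect them, then write them all
rows = [
    {'user_id': 'u1', 'contents': 'first post'},
    {'user_id': 'u2', 'contents': 'second post'},
]
save_user_rows(rows)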

Scrape facebook AttributeError

I am a beginner with Python.
How can I solve
AttributeError: module 'urllib' has no attribute 'Request'
I have looked at other posts but still can't understand how to solve the problem.
Here is the screen capture of the error.
And this is the code (I refer to https://github.com/minimaxir/facebook-page-post-scraper/blob/master/get_fb_posts_fb_page.py):
import urllib.request
import json, datetime, csv, time

app_id = "xxx"
app_secret = "xxx"  # DO NOT SHARE WITH ANYONE!
access_token = "xxx"
page_id = 'xxx'

def testFacebookPageData(page_id, access_token):
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id + '/feed'
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    # retrieve data
    response = urllib.request.urlopen(url)
    data = json.loads(response.read().decode('utf-8'))
    print(data)

def request_until_succeed(url):
    req = urllib.request.urlopen(url)
    success = False
    while success is False:
        try:
            response = urllib.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print(url, datetime.datetime.now())
    return response.read()

def getFacebookPageFeedData(page_id, access_token, num_statuses):
    # construct the URL string
    base = "https://graph.facebook.com"
    node = "/" + page_id + "/feed"
    parameters = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token)  # changed
    url = base + node + parameters
    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data

def processFacebookPageFeedStatus(status):
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.
    # Additionally, some items may not always exist,
    # so must check for existence first
    status_id = status['id']
    status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8')
    link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8')
    status_type = status['type']
    status_link = '' if 'link' not in status.keys() else status['link']
    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5)  # EST
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs
    # Nested items require chaining dictionary keys.
    num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count']
    num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status.keys() else status['shares']['count']
    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)

def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'w') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_likes", "num_comments", "num_shares"])
        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()
        print(page_id, scrape_starttime)
        statuses = getFacebookPageFeedData(page_id, access_token, 100)
        while has_next_page:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))
                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print(num_processed, datetime.datetime.now())
            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False
        print(num_processed, datetime.datetime.now() - scrape_starttime)

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
There is no urllib.Request() in Python 3 - there is urllib.request.Request().
EDIT: you have url = urllib.Request(url) in the error message, but I don't see this line in your code - maybe you ran the wrong file.
