def get_user_data(self, start_url):
    html = self.session.get(url=start_url, headers=self.headers, cookies=self.cookies).content
    selector = etree.fromstring(html, etree.HTMLParser(encoding='utf-8'))
    all_user = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]')
    for i in all_user:
        user_id = i.xpath('./div[1]/a[@class="nk"]/@href')[0]
        content = i.xpath('./div[1]/span[1]')[0]
        contents = content.xpath('string(.)')
        times = i.xpath('./div/span[@class="ct"]/text()')[0]
        if len(i.xpath('./div[3]')):
            imgages = i.xpath('./div[2]/a/img/@src')
            praise_num = i.xpath('./div[3]/a[2]/text()')
            transmit_num = i.xpath('./div[3]/a[3]/text()')
        elif len(i.xpath('./div[2]')):
            imgages = i.xpath('./div[2]/a/img/@src')
            praise_num = i.xpath('./div[2]/a[3]/text()')
            transmit_num = i.xpath('./div[2]/a[4]/text()')
        else:
            imgages = ''
            praise_num = i.xpath('./div[1]/a[2]/text()')
            transmit_num = i.xpath('./div[1]/a[3]/text()')
        try:
            if re.search('from', times.encode().decode('utf-8')):
                month_day, time, device = times.split(maxsplit=2)
                self.data['mobile_phone'] = device
            else:
                time, device = times.split(maxsplit=1)
                self.data['month_day'] = ''
            self.data['create_time'] = month_day + ' ' + time
        except Exception as e:
            print('failure:', e)
        self.data['crawl_time'] = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        self.data['user_id'] = user_id
        self.data['contents'] = contents.encode().decode('utf-8').replace('\u200b', '')
        self.data['imgages'] = imgages
        self.data['praise_num'] = praise_num
        self.data['transmit_num'] = transmit_num
    with open('a.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(self.data) + '\n')
I am trying to grab the data from every page and save it, but I got it wrong: only one record from each page ends up in 'a.txt'. How do I write this so that every record from every page is saved correctly in 'a.txt'?
The write operation is outside the for loop, which is why only the last iteration's data is added to the file:
with open('a.txt', 'a', encoding='utf-8') as f:
    f.write(json.dumps(self.data) + '\n')
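A minimal sketch of that fix, assuming the loop body that fills self.data stays exactly as in the question: move the write inside the for loop so one JSON line is appended per user.

import json

for i in all_user:
    # ... build self.data for this user exactly as before ...
    with open('a.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(self.data) + '\n')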
You're overwriting the various values in self.data in every iteration of the loop.
Instead, self.data should be a list. You should create a new dictionary in each iteration and append it to the data at the end.
self.data = []
for i in all_user:
    values = {}
    ...
    values['crawl_time'] = ...
    values['user_id'] = ...
    ...
    self.data.append(values)
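Once self.data is a list with one dict per user, the whole page can then be written out after the loop; a sketch, assuming each record is JSON-serializable:

import json

# write every collected record, one JSON object per line
with open('a.txt', 'a', encoding='utf-8') as f:
    for record in self.data:
        f.write(json.dumps(record) + '\n')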
This will be a long post, as the problem I'm facing is embedded in a larger project -- thanks to anyone who takes the time to read this.
Basically, I'm scraping the Wikipedia:Featured articles page. There are hundreds of article links on this page, and I have already succeeded in compiling a list of articles from this page that are biographies. The following code was used:
def __api_GET_latest_page(title):
    parameters = {
        "action": "parse",
        "page": title,
        "format": "json"
    }
    response_json = __get("revisions", title, parameters)
    if ("parse" in response_json.keys()
            and "text" in response_json["parse"].keys()
            and "*" in response_json["parse"]["text"].keys()):
        return response_json["parse"]["text"]["*"]
    return None
def __get(function_key, key, parameters, check_cache=True, write_cache=True):
    target = "https://en.wikipedia.org/w/api.php"
    cache_path = "cached_api"
    params_unicode = str(parameters).encode('utf-8')
    md5 = hashlib.md5(params_unicode).hexdigest()
    return_json = None
    cache_file = os.path.join(cache_path, function_key, str(key), md5)
    cache_exists = os.path.isfile(cache_file)
    if cache_exists:
        try:
            json_in = open(cache_file, "r")
            json_str = json_in.read()
            return_json = json.loads(json_str)
            if "error" in return_json.keys() and "code" in return_json["error"].keys() and return_json["error"]["code"] == "maxlag":
                cache_exists = False
        except:
            cache_exists = False
    if not cache_exists:
        cache_dir = os.path.dirname(cache_file)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        r = requests.get(target, params=parameters)
        request_json = r.json()
        json_out = open(cache_file, "w")
        print(json.dumps(request_json), file=json_out)
        return_json = request_json
    return return_json
def __remove_tables_and_scripts(tree):
    tags_to_remove = ["tbody", "td", "script"]
    for tag in tags_to_remove:
        elements = tree.find(f".//{tag}")
        if elements is not None:
            for e in elements:
                e.getparent().remove(e)
    return tree
def page_text(name, format, include_tables=False):
    try:
        result = __api_GET_latest_page(name)
    except:
        print("API request failed.")
    if result:
        e = etree.fromstring(result)
        if not include_tables:
            e = __remove_tables_and_scripts(e)
        if format == "html":
            return str(etree.tostring(e))
        elif format == "text":
            return ''.join(e.itertext())
        elif format == "list":
            return ''.join(e.itertext()).split('\n')
    else:
        print("Failed to retrieve a page.")
        return None
The above code, specifically the page_text() function, gets the plain text of any Wikipedia page and caches the result locally. Anyway, with the following code, I got a list of all the article titles on the Wikipedia Featured articles page which are biographies:
def get_featured_biographies(t):
    titles = page_text("Wikipedia:Featured articles", "list")
    titles = titles[40:]
    titles = titles[:-7]
    titles = list(filter(lambda x: x != '', titles))
    list_featured_biographies = []
    boolean = False
    for elem in t:
        if ('[edit]' in elem) and ('biographies' in elem) | ('Biographies' in elem):
            boolean = True
            continue
        elif ('[edit]' in elem) and ('biographies' not in elem):
            boolean = False
        if boolean:
            list_featured_biographies = list_featured_biographies + [elem]
        else:
            continue
    return list_featured_biographies
list_featured_biographies = get_featured_biographies(titles)
This is an example of the output:
Here's where I run into problems. I need to write a function that scrapes the individual pages for the featured article biography titles in the list I created. Specifically, I need to write a function that extracts the first paragraph of each biography. I have succeeded in this task with the following code:
for title in list_featured_biographies:
    page_content = page_text(title, "list")
    list_of_values_with_keywords = []
    for value in page_content:
        if ('was a' in value) | ('was an ' in value) | ('is a ' in value) | ('is an ' in value):
            list_of_values_with_keywords.append(value)
    first_paragraph = list_of_values_with_keywords[0]
    print(first_paragraph)
And so Bronwyn Bancroft, Felice Beato, and Jean Bellette are the first three names. The following screenshot shows the output for these first three names.
As you can see, my output is essentially a list of first paragraphs. I would like to organize this information into a two-column dataframe, with the first column being the article title and the second being the article's first paragraph. The following code encounters an error in trying to achieve this:
title2_list = []
list_of_first_para = []

for title in list_featured_biographies:
    page_content = page_text(title, "list")
    title2_list.append(title)
    list_of_values_with_keywords = []
    for value in page_content:
        if ('was a' in value) | ('was an ' in value) | ('is a ' in value) | ('is an ' in value):
            list_of_values_with_keywords.append(value)
    first_paragraph = list_of_values_with_keywords[0]
    list_of_first_para.append(first_paragraph)

data2_for_df = {'Article_Title': title2_list, 'First_Paragraph': list_of_first_para}
wiki1para_df = pd.DataFrame(data2_for_df)
print(wiki1para_df)
This is the error I run into:
IndexError Traceback (most recent call last)
<ipython-input-317-f36585876409> in <module>
13 return first_paragraph
14
16 print(first_paragraph)
<ipython-input-317-f36585876409> in get_first_paragraph(list)
9 list_of_values_with_keywords.append(value)
10
---> 11 first_paragraph = list_of_values_with_keywords[0]
12
13 return first_paragraph
IndexError: list index out of range
I am trying to scrape data using a loop, and this is the code:
import requests
import json
import pandas as pd

parameters = ['a:1','a:2','a:3','a:4','a:3','a:4','a:5','a:6','a:7','a:8','a:9','a:10']

results = pd.DataFrame()
for item in parameters:
    key, value = item.split(':')
    url = "https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword=%s&type=2&limit=%s" % (key, value)
    r = requests.get(url)
    cont = json.loads(r.content)
    temp_df = pd.DataFrame(cont)
    results = results.append(temp_df)

results.to_csv('ScrapeData.csv', index=False)
This method works great, but the problem is that I need the parameters to go up to 'a:1000', and I think there must be a better way to loop from 'a:1' to 'a:1000' instead of duplicating entries in parameters like in my code. I really need your help.
You can use a for i in range(start, end) loop, like this:
results = pd.DataFrame()
key = 'a'

# Goes from 1 to 1000 (including both)
for value in range(1, 1001):
    url = f'https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword={key}&type=2&limit={value}'
    r = requests.get(url)
    cont = json.loads(r.content)
    temp_df = pd.DataFrame(cont)
    results = results.append(temp_df)

results.to_csv('ScrapeData.csv', index=False)
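As a side note, DataFrame.append is deprecated and was removed in pandas 2.0, so on newer pandas versions a safer pattern is to collect the frames in a list and concatenate once at the end; a sketch using the same URL as above:

import json
import requests
import pandas as pd

frames = []
for value in range(1, 1001):
    url = f'https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword=a&type=2&limit={value}'
    r = requests.get(url)
    frames.append(pd.DataFrame(json.loads(r.content)))

# one concatenation instead of repeated appends
results = pd.concat(frames, ignore_index=True)
results.to_csv('ScrapeData.csv', index=False)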
value = 1
key = 'a'
while value <= 1000:
    url = .....%(key, str(value))
    ....
    ....
    value += 1
    ......
Use a counter
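A filled-in version of that counter idea, reusing the URL pattern and DataFrame handling from the question (the elided parts in the sketch above are assumptions here):

import json
import requests
import pandas as pd

results = pd.DataFrame()
key = 'a'
value = 1  # the counter
while value <= 1000:
    url = "https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword=%s&type=2&limit=%s" % (key, str(value))
    r = requests.get(url)
    temp_df = pd.DataFrame(json.loads(r.content))
    results = results.append(temp_df)
    value += 1

results.to_csv('ScrapeData.csv', index=False)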
I have a function that gathers a list of UPCs. I created another function to take the list and search for prices. The issue I am having is that when a UPC is not found, a KeyError occurs. How do I ignore the UPCs with no match and continue with the code? The current version of the code is an infinite loop.
def trending():
    trending = requests.get('http://api.com/trends?format=json&apiKey={}'.format(apiKey))
    trendingResponse = trending.json()
    items = trendingResponse['items']
    for item in items:
        price = item['salePrice']
        name = item['name']
        upc = item['upc']
        stock = item['stock']
        image = item['largeImage']
        url = item['productUrl']
        sDescription = item['shortDescription']
        brandName = item['brandName']
        availableOnline = item['availableOnline']
        print('Current UPC = ' + str(upc))
        return upc_lookup(upc)

def upc_lookup(upc):
    products_api = mws.Products(access_key, secret_key, seller_id, region='US')
    # lookup product by upc
    products = products_api.get_matching_product_for_id(marketplaceid=marketplace_usa, type_='UPC', ids=upc)
    parse = products.parsed
    while True:
        try:
            # return asin from UPC lookup
            asin = parse['Products']['Product']['Identifiers']['MarketplaceASIN']['ASIN']['value']
            print('ASIN Found = ' + str(asin))
        except KeyError:
            print('UPC {} not Found in Amazon'.format(upc))
Looks like I had to move the return in the first function out of the for loop.
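A minimal sketch of that fix, assuming the same imports and credentials as in the question (requests, mws, apiKey, access_key, secret_key, seller_id, marketplace_usa) and that unmatched UPCs should simply be skipped; the while True is dropped so the lookup no longer loops forever:

def trending():
    response = requests.get('http://api.com/trends?format=json&apiKey={}'.format(apiKey))
    items = response.json()['items']
    asins = []
    for item in items:
        upc = item['upc']
        print('Current UPC = ' + str(upc))
        asin = upc_lookup(upc)  # look up every UPC instead of returning on the first one
        if asin is not None:
            asins.append(asin)
    return asins  # return once, after the loop

def upc_lookup(upc):
    products_api = mws.Products(access_key, secret_key, seller_id, region='US')
    products = products_api.get_matching_product_for_id(marketplaceid=marketplace_usa, type_='UPC', ids=upc)
    parse = products.parsed
    try:
        asin = parse['Products']['Product']['Identifiers']['MarketplaceASIN']['ASIN']['value']
        print('ASIN Found = ' + str(asin))
        return asin
    except KeyError:
        print('UPC {} not Found in Amazon'.format(upc))
        return None  # no match: skip this UPC and keep going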
def function_1():
    function_1.item = {'salePrice': 100, 'name': 'ABC', 'stock': 3, 'brand-name': 4}

def function_2():
    item = function_1.item
    sp = item['salePrice']
    name = item['name']
    stock = item['stock']
    print(sp, name, stock)

function_1()
function_2()
I want to ask something about updating an attribute in an XML file. I've created the code below and there are no errors: it already reads the parameter data, but it does not save the updated value data.
My code:
def user_submit(request):
    baseurl = request.build_absolute_uri()
    parsed = urlparse.urlparse(baseurl)
    params = urlparse.parse_qs(parsed.query)
    param = params.get('Users', [''])[0]
    if request.method == 'POST':
        form = UsersForm(request.POST)
        passw = form.data['password']
        real = form.data['realname']
        # role = form.data['rolename']
        files = os.path.join(settings.WSS, 'users.xml')
        doc = minidom.parse(files)
        items = doc.getElementsByTagName("UserRepository")
        for items2 in items:
            for items3 in items2.getElementsByTagName("User"):
                username = items3.getAttribute('username')
                # items3 = []
                if username == param:
                    username = items3.getAttribute('username')
                    password = items3.getAttribute('password')
                    realname = items3.getAttribute('realname')
                    items3.attributes['password'].value = passw
                    items3.attributes['realname'].value = real
                    # for items4 in items3.getElementsByTagName("Role"):
                    #     results2.append({
                    #         items4.attributes['rolename'].value = role })
        xml_file = open(files, "w")
        doc.writexml(xml_file, encoding="utf-8")
        xml_file.close()
    return HttpResponseRedirect(reverse('users_list'))
Param value: (screenshot of the parameter's value omitted)
Can anyone help me solve this problem?
One of my friends was developing a Scrapy script to scrape data from a page.
After some time, I needed to add another field to it. I added the field successfully, but the problem is that the field is not getting the data from the links inside the td. The field name is "Last Batsman".
Data URL:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
XPath of the Data:
//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

from digicricket.items import ODIorTestItem


class DigicricketMarsilOp1Spider(scrapy.Spider):
    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

    def __init__(self, match_id=None):
        if match_id:
            match_id_list = match_id.split(',')
            for i in match_id_list:
                if not i.isdigit():
                    raise CloseSpider('Match ID = {0} is not a number'.format(i))
                else:
                    self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
                                       for i in match_id_list]
        else:
            raise CloseSpider('You forgot input Match ID/IDs')

    def parse(self, response):
        item = ODIorTestItem()
        item['Batsman_op1'] = []
        item['Bowler_op1'] = []
        item['other_op1'] = []
        sel = Selector(response)
        tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
        row_for_other = dict()
        for i in xrange(len(tables)):
            html_text = BeautifulSoup(tables[i])
            if i == 1:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Batsman"] = td[0].get_text()
                        row["R"] = td[1].get_text()
                        row["B"] = td[2].get_text()
                        row["4s"] = td[3].get_text()
                        row["6s"] = td[4].get_text()
                        row["SR"] = td[5].get_text()
                        item['Batsman_op1'].append(row)
            elif i == 2:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Bowler"] = td[0].get_text()
                        row["O"] = td[1].get_text()
                        row["M"] = td[2].get_text()
                        row["R"] = td[3].get_text()
                        row["W"] = td[4].get_text()
                        row["Econ"] = td[5].get_text()
                        item['Bowler_op1'].append(row)
            else:
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                if i == 0:
                    try:
                        row_for_other["InningsMatchDetails"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                         'table[1]/tr/td/b/text()[1]').extract()[0]
                    except:
                        row_for_other["InningsMatchDetails"] = None
                    try:
                        row_for_other["CurrentScore"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                  'table[1]/tr/td/b/span/text()').extract()[0]
                    except:
                        row_for_other["CurrentScore"] = None
                    try:
                        row_for_other["OversRunRate"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                  'table[1]/tr/td/b/text()[2]').extract()[0]
                    except:
                        row_for_other["OversRunRate"] = None
                    try:
                        row_for_other["Extras"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
                                                            'tr/td/b/text()[3]').extract()[0]
                    except:
                        row_for_other["Extras"] = None
                    try:
                        row_for_other["MatchResult"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                 'table[1]/tr/td/b/text()[4]').extract()[0]
                    except:
                        row_for_other["MatchResult"] = None
                    try:
                        row_for_other["RecentOvers"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                 'table[4]/tr/td[2]/text()').extract()[0]
                    except:
                        row_for_other["RecentOvers"] = None
                    try:
                        row_for_other["LastBatsman"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                 'table[6]/tr/td/text()').extract()[0]
                    except:
                        row_for_other["LastBatsman"] = None
                    row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
                    item['other_op1'].append(row_for_other)
        return item
Your XPath seems to miss some tags. On the web page there are two div levels before the second table; replacing / with // takes care of these. (Because my browser added some <tbody> tags, there is also a double slash in front of the tr.)
.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
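Plugged into the spider above, the Last Batsman lookup would then look something like this (a sketch based on the corrected expression, keeping the question's try/except style):

try:
    # sel and row_for_other come from the parse() method in the question
    row_for_other["LastBatsman"] = sel.xpath('.//*[@id="ctl00_ContentPlaceHolder1_divData"]'
                                             '//table[6]//tr/td/a[1]/text()').extract()[0]
except:
    row_for_other["LastBatsman"] = None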