I've followed a tutorial to scrape from a facebook profile and I keep getting this error:
JSONDecodeError: Extra data: line 1 column 8 (char 7)
Does anyone know what the problem might be?
Here is my python script:
def get_bs(session, url):
#Makes a GET requests using the given Session objectand returns a BeautifulSoup object.
r = None
while True:
r = session.get(url)
if r.ok:
break
return BeautifulSoup(r.text, 'lxml'
#To login
def make_login(session, base_url, credentials):
#Returns a Session object logged in with credentials.
login_form_url = '/login/device-based/regular/login/?refsrc=https%3A'\
'%2F%2Fmobile.facebook.com%2Flogin%2Fdevice-based%2Fedit-user%2F&lwv=100'
params = {'email':credentials['email'], 'pass':credentials['pass']}
while True:
time.sleep(3)
logged_request = session.post(base_url+login_form_url, data=params)
if logged_request.ok:
logging.info('[*] Logged in.')
break
#Crawling FB
def crawl_profile(session, base_url, profile_url, post_limit):
#Goes to profile URL, crawls it and extracts posts URLs.
profile_bs = get_bs(session, profile_url)
n_scraped_posts = 0
scraped_posts = list()
posts_id = None
while n_scraped_posts < post_limit:
try:
posts_id = 'recent'
posts = profile_bs.find('div', id=posts_id).div.div.contents
except Exception:
posts_id = 'structured_composer_async_container'
posts = profile_bs.find('div', id=posts_id).div.div.contents
posts_urls = [a['href'] for a in profile_bs.find_all('a', text='Full Story')]
for post_url in posts_urls:
# print(post_url)
try:
post_data = scrape_post(session, base_url, post_url)
scraped_posts.append(post_data)
except Exception as e:
logging.info('Error: {}'.format(e))
n_scraped_posts += 1
if posts_completed(scraped_posts, post_limit):
break
show_more_posts_url = None
if not posts_completed(scraped_posts, post_limit):
show_more_posts_url = profile_bs.find('div', id=posts_id).next_sibling.a['href']
profile_bs = get_bs(session, base_url+show_more_posts_url)
time.sleep(3)
else:
break
return scraped_posts
def get_bs(session, url):
#Makes a GET requests using the given Session object and returns a BeautifulSoup object.
r = None
while True:
r = session.get(url)
time.sleep(3)
if r.ok:
break
return BeautifulSoup(r.text, 'lxml')
#Scraping FB
def scrape_post(session, base_url, post_url):
#Goes to post URL and extracts post data.
post_data = OrderedDict()
post_bs = get_bs(session, base_url+post_url)
time.sleep(5)
# Here we populate the OrderedDict object
post_data['url'] = post_url
#Find Post main element
try:
post_text_element = post_bs.find('div', id='u_0_0').div
string_groups = [p.strings for p in post_text_element.find_all('p')]
strings = [repr(string) for group in string_groups for string in group]
post_data['text'] = strings
except Exception:
post_data['text'] = []
#Extract post media URL
try:
post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
except Exception:
post_data['media_url'] = ''
#Extract remaining data
try:
post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
except Exception:
post_data['comments'] = []
return dict(post_data)
#Scraping FB
def scrape_post(session, base_url, post_url):
#Goes to post URL and extracts post data.
post_data = OrderedDict()
post_bs = get_bs(session, base_url+post_url)
time.sleep(5)
# Here we populate the OrderedDict object
post_data['url'] = post_url
#Find Post main element
try:
post_text_element = post_bs.find('div', id='u_0_0').div
string_groups = [p.strings for p in post_text_element.find_all('p')]
strings = [repr(string) for group in string_groups for string in group]
post_data['text'] = strings
except Exception:
post_data['text'] = []
#Extract post media URL
try:
post_data['media_url'] = post_bs.find('div', id='u_0_0').find('a')['href']
except Exception:
post_data['media_url'] = ''
#Extract remaining data
try:
post_data['comments'] = extract_comments(session, base_url, post_bs, post_url)
except Exception:
post_data['comments'] = []
return dict(post_data)
#Function for profile URL and creditials for FB
def json_to_obj(filename):
#Extracts data from JSON file and saves it on Python object
obj = None
with open(filename) as json_file:
obj = json.loads(json_file.read())
return obj
def save_data(data):
#Converts data to JSON.
with open('profile_posts_data.json', 'w') as json_file:
json.dump(data, json_file, indent=4)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
base_url = 'https://mobile.facebook.com'
session = requests.session()
# Extracts credentials for the login and all of the profiles URL to scrape
credentials = json_to_obj('credentials.json')
profiles_urls = json_to_obj('profiles_urls.json')
make_login(session, base_url, credentials)
posts_data = None
for profile_url in profiles_urls:
posts_data = crawl_profile(session, base_url, profile_url, 25)
logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
logging.info('[!] Saving.')
save_data(posts_data)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
base_url = 'https://mobile.facebook.com'
session = requests.session()
# Extracts credentials for the login and all of the profiles URL to scrape
credentials = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\credentials.json")
profiles_urls = json_to_obj(r"C:\Users\E7450\Desktop\GIS702\FBScrapping\profiles_urls.json")
make_login(session, base_url, credentials)
posts_data = None
for profile_url in profiles_urls:
posts_data = crawl_profile(session, base_url, profile_url, 25)
logging.info('[!] Scraping finished. Total: {}'.format(len(posts_data)))
logging.info('[!] Saving.')
save_data(posts_data)
Related
The code below creates the sitemap but it is including 404 error URL's. How do I exclude them from the sitemap?
from usp.tree import sitemap_tree_for_homepage
import xml.etree.cElementTree as ET
import simplejson as json
from datetime import date
tree = sitemap_tree_for_homepage('')
root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
for page in tree.all_pages():
url = page.url
prio = json.dumps(page.priority, use_decimal=True)
# format YYYY-MM-DDThh:mmTZD see: https://www.w3.org/TR/NOTE-datetime
lm = date.today().strftime("%Y-%m-%d")
cf = page.change_frequency.value
urlel = ET.SubElement(root, "url")
ET.SubElement(urlel, "loc").text = url
ET.SubElement(urlel, "lastmod").text = lm
ET.SubElement(urlel, "changefreq").text = cf
ET.SubElement(urlel, "priority").text = prio
ET.indent(root, " ") # pretty print
xmltree = ET.ElementTree(root)
xmltree.write("sitemap.xml", encoding="utf-8", xml_declaration=True )
You have to use (external module) requests or (built-in module) urllib.request to check if url gives status 404
import urllib.request
import urllib.error
url = 'https://stackoverflow.com/fake_url' # wrong url
#url = 'https://stackoverflow.com/' # correct url
try:
r = urllib.request.urlopen(url)
print('adding url:', url)
# ... add `url` to sitemap ...
except urllib.error.HTTPError as ex:
print('ex:', ex)
print('wrong url:', url)
or
import requests
url = 'https://stackoverflow.com/fake_url'
#url = 'https://stackoverflow.com/'
response = requests.get(url)
#if response.status_code != 404:
if response.status_code == 200:
print('adding url:', url)
# ... add `url` to sitemap ...
else:
print('wrong url:', url)
EDIT:
You could put it as function and return True/False
import urllib.request
import urllib.error
def is_correct_url(url):
try:
r = urllib.request.urlopen(url)
return True
except urllib.error.HTTPError as ex:
print('ex:', ex)
return False
# ---
for page in tree.all_pages():
url = page.url
if is_correct_url(url):
print('adding url:', url)
# ... add `url` to sitemap ...
else:
print('wrong url:', url)
import requests
def is_correct_url(url):
response = requests.get(url)
#return response.status_code != 404:
return response.status_code == 200:
# ---
for page in tree.all_pages():
url = page.url
if is_correct_url(url):
print('adding url:', url)
# ... add `url` to sitemap ...
else:
print('wrong url:', url)
This way only the non-broken links will be added
from usp.tree import sitemap_tree_for_homepage
import xml.etree.cElementTree as ET
import simplejson as json
from datetime import date
import requests
tree = sitemap_tree_for_homepage('')
root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
for page in tree.all_pages():
url = page.url
response = requests.get(url)
if response.status_code == 200:
prio = json.dumps(page.priority, use_decimal=True)
# format YYYY-MM-DDThh:mmTZD see: https://www.w3.org/TR/NOTE-datetime
lm = date.today().strftime("%Y-%m-%d")
cf = page.change_frequency.value
urlel = ET.SubElement(root, "url")
ET.SubElement(urlel, "loc").text = url
ET.SubElement(urlel, "lastmod").text = lm
ET.SubElement(urlel, "changefreq").text = cf
ET.SubElement(urlel, "priority").text = prio
ET.indent(root, " ") # pretty print
xmltree = ET.ElementTree(root)
xmltree.write("sitemap.xml", encoding="utf-8", xml_declaration=True )
I have this function that fecthes a bunch of images:
def get_player_images_with_api():
url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
headers = {'Origin': 'https://www.premierleague.com'}
page=0
while True:
try:
data = requests.get(url.format(page=page), headers=headers).json()
for player in data['content']:
print('{:<50} {}'.format(player['name']['display'], img_url.format(player_id=player['altIds']['opta'])))
sleep(2)
page+=1
except:
break
How do I dinamically save each image on a 'path/to/image' folder with player['name'].png format?
Here you go :)
import requests
from time import sleep
import urllib.request
def get_player_images_with_api():
url = 'https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page={page}&type=player&id=-1&compSeasonId=274'
img_url = 'https://resources.premierleague.com/premierleague/photos/players/250x250/{player_id}.png'
headers = {'Origin': 'https://www.premierleague.com'}
page = 0
while True:
try:
data = requests.get(url.format(page=page), headers=headers).json()
for player in data['content']:
print('{:<50} {}'.format(
player['name']['display'],
img_url.format(player_id=player['altIds']['opta'])))
urllib.request.urlretrieve(
img_url.format(player_id=player['altIds']['opta']),
player['name']['display'] + ".png")
sleep(2)
page += 1
except:
break
*Just to clarify in advance, I use Postman to test my requests and they return the results im looking for.
I'm connecting to an API using Python. The API will only return 500 records per request and it will provide the total number of records in the first response header 'x-test-count'
I'm obviously not python savvy and feel that im handling pagination completely wrong. Take a look at the async get function. Basically, it takes the total count from the first response and loops through running
async with session.get(paging_url) as response:
page_results = await response.json()
pages.extend(page_results)
It does return results but only 500. So it would seem that its not capturing each iteration.
class Queue:
def __init__(self, id, type):
self.id = id
self.type = type
self.requests = []
class Test:
def __init__(self):
self.queue = []
self.queue_list = []
self.coroutines = []
self.headers = {
'Content-Type': 'application/json',
'x-test-token': self.token,
}
def get_id(self, type=''):
id = datetime.now().strftime('%Y%m-%d%H-%M%S-') + str(uuid4())
if type != '':
id = type + '-' + id
return id
def url_encode(self, url):
# doesn't like encoding urls using yarl. I'm manually handling them below with UTF-8 encode
url = url.replace(' ', '%20')
#url = url.replace('?', '%3F')
return url
def queue_create(self, type=''):
id = self.get_id(type='queue')
if type == '':
self.debug('Error: queue_create was not given a type')
return
id = Queue(id=id, type=type)
self.debug('queue_create instantiated new queue class named: ' + id)
# TODO: Add to list of active queues to track for create and destroy
# Return name of new object
return id
def queue_run(self, name=''):
self.debug('Starting queue_run')
if name == '':
self.debug('Error: queue_run asked to run without providing a name')
#return
**async def get(url, headers):
async with aiohttp.ClientSession(headers=headers, connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
async with session.get(url) as response:
self.debug('HTTP Response: ' + str(response.status))
# Set pagination vars to 1
current_page = 1
page_range = 1
# Check the status code. If other than 200, stop
assert response.status == 200
# Get the count of records. If not provided, set last_page to 1
try:
page_range = int(response.headers['x-test-count'])
self.debug(response.headers['x-test-count'])
except:
self.debug('x-test-count not provided, defaulted to 1')
first_page_results = await response.json()
if page_range == 1:
self.debug('Returning first page results only')
return first_page_results
else:
self.debug('Total results: ' + str(page_range) + '. Performing additional requests.')
pages = []
for records in range(1,page_range,500):
remaining_records = page_range - records
if remaining_records > 500:
paging_size = 500
else:
paging_size = remaining_records
# Create the paging URL
paging_url = url + '&size=' + str(paging_size) + '&from=' + str(records)
# Run paged requests
async with session.get(paging_url) as response:
page_results = await response.json()
# combine paged requests
pages.extend(page_results)
# Clear paging URL
paging_url = ''
return pages**
# Establish the loop
loop = asyncio.get_event_loop()
# Establish coroutines and populate with queries from queue
coroutines = []
for query in self.queue:
# Removed a lot of the actual code here. Basically, this establishes the URL and appends coroutines
coroutines.append(get(url, headers=headers))
# Start the asyncio loop
results = loop.run_until_complete(asyncio.gather(*coroutines))
return results
def add_request(self, type, endpoint, query='', deleted=False, data='', full=False, paging_size='', paging_from=''):
self.debug('Starting add_request')
self.debug('Queue before append: ', item=self.queue)
self.queue.append([type, endpoint, query, deleted, data, full, paging_size, paging_from])
self.debug('Queue after append: ', item=self.queue)
return self.queue
So to run, it looks something like this
Test = Test()
Test.add_request('read', 'personnel', '', full=True ,deleted=False)
response = Test.queue_run()
I have a list of urls, which contain JSON files.
The JSON files are all stored differently, therefore I need try and except blocks to cover the different storing methods.
The problem is that this method leads to some duplicates, because some links get requested two times or more in the different blocks.
My code:
for line in urls:
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten)
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
except:
pass
Is it possible to write the try/except blocks in a way that a link gets ignored, if it was succesfully requested in a previous block?
The first 2 try/except blocks are explicitly duplicated and there's no functional benefit to repeat them.
Instead, think through 2 consecutive phases:
extracting a remote resource
parsing JSON string and storing the result
So when extracting phase is failed - no sense to move forward, if the 1st parsing phase is failed - try another kind of parsing:
for line in urls:
try:
response = requests.get(line)
textinhalt = response.text
except:
continue
try:
try:
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
except:
textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten)
except:
pass
This should solve it for you
for line in urls:
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten)
continue
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
#textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
continue
except:
pass
try:
response = requests.get(line)
textinhalt = response.text
textinhalt = textinhalt.split("__IR_CURRPAGE_DATA_JSON__")[1]
daten0 = json.loads(textinhalt[textinhalt.find("{"):textinhalt.rfind("}")+1])
r_urls.append(daten0)
except:
pass
i want to create simple website for pnr check . code is working right but it performs only one work at a time either render result to other page or send mail . while working with thread it send mail on back end till then page remains loading .
please anyone give me suggestion. also i want to run it on google appengine so i haven't tried celery.
from django.http import HttpResponse
from bs4 import BeautifulSoup
import re
import requests
from django.shortcuts import render
from functools import partial, wraps
from django.core.mail import send_mail
import time
import thread
def checkpnr(request):
return render(request, 'checkpnr.html')
def check(string, pnr, sleeptime, lock, *args):
while 1:
# entering critical section
lock.acquire()
# time1=request.get_all("notif")
url_pnr = "pnr url"
r = requests.get(url_pnr)
data = r.text
soup = BeautifulSoup(data)
train = str(soup.find("ul", attrs={"class": "train_info"}))
train_number = soup.find("li", attrs={"class": "first"}).text
source = str(soup.find("travellers"))
route = str(soup.findAll("li")[1]).replace(
'<li>', '').replace('</li>', '')
#head, sep, tail = route.partition(' -')
travel_date = str(soup.findAll("li")[2].text)
date, sep, total = travel_date.partition('|')
rows = soup.findAll("td", attrs={"class": "pax"})
rowlength = len(rows)
chart_status = str(soup.findAll("tr")[rowlength + 1].findAll("td")[0]).replace(
'<td colspan="3"><strong>', '').replace('</strong>', '').replace('</td>', '')
passengers = []
status = []
coach = []
tot = []
w=''
i = 1
j = 1
while i <= rowlength:
j = str(soup.findAll("tr")[i].findAll(
"td")[0].text).replace(':', '')
passengers.append(j)
s = str(soup.findAll("tr")[i].findAll("td")[1].text)
w=w+','+s
status.append(s)
c = str(soup.findAll("tr")[i].findAll("td")[2].text)
coach.append(c)
tot.append(i)
i += 1
time.sleep(sleeptime)
emailMsg = status
subject = pnr+'-'+w
send_mail(
subject,'emailMsg', 'email-from',
[email], fail_silently=False)
lock.release()
if (status[rowlength - 1] == "CONFIRMED"):
time.sleep(sleeptime)
else:
time.sleep(1000000000000000000000000)
def fetch(request):
pnr = request.POST['pnr']
if len(pnr) != 10:
msg = "PNR must be of 10 digits ..."
return render(request, 'checkpnr.html', {'msg': msg})
email = request.POST['email']
e = request.POST['ntime']
if (e != ''):
n_time = int(e)
n = request.POST['notify']
if (n != ''):
notify = int(n)
sleeptim = notify * n_time
sleeptime= 10
# time1=request.get_all("notif")
url_pnr = "pnr url"
try:
r = requests.get(url_pnr)
data = r.text
soup = BeautifulSoup(data)
train = str(soup.find("ul", attrs={"class": "train_info"}))
train_number = soup.find("li", attrs={"class": "first"}).text
source = str(soup.find("travellers"))
route = str(soup.findAll("li")[1]).replace(
'<li>', '').replace('</li>', '')
#head, sep, tail = route.partition(' -')
travel_date = str(soup.findAll("li")[2].text)
date, sep, total = travel_date.partition('|')
rows = soup.findAll("td", attrs={"class": "pax"})
rowlength = len(rows)
chart_status = str(soup.findAll("tr")[rowlength + 1].findAll("td")[0]).replace(
'<td colspan="3"><strong>', '').replace('</strong>', '').replace('</td>', '')
passengers = []
status = []
coach = []
tot = []
w=''
i = 1
j = 1
while i <= rowlength:
j = str(soup.findAll("tr")[i].findAll(
"td")[0].text).replace(':', '')
passengers.append(j)
s = str(soup.findAll("tr")[i].findAll("td")[1].text)
w=w+','+s
status.append(s)
c = str(soup.findAll("tr")[i].findAll("td")[2].text)
coach.append(c)
tot.append(i)
i += 1
msg = "Mail not Sent"
msg1 = ''
if(email != ''):
emailMsg = status
subject = pnr+'-'+w
send_mail(
subject,'emailMsg', 'ashutosh8nitjsr#gmail.com',
[email], fail_silently=False)
msg = "mail sent.."
if __name__ == "__main__":
lock = thread.allocate_lock()
thread.start_new_thread(
check,("Thread No:1", pnr, email, sleeptime, lock))
msg1 = "thread created"
time.sleep(sleeptime)
while 1:
pass
detail2 = {
'train_number': train_number, 'route': route, 'date': date, 'chart_status': chart_status, 'tot': tot,
'passengers': passengers, 'status': status, 'coach': coach, 'msg': msg}
return render(request, 'status.html', detail2)l
except:
msg = "there was error. please try again..."
return render(request, 'checkpnr.html', {'msg': msg})
You can try using TaskQueues for this purpose on app engine.
Task Queue API