How can I get total result count from google custom search? - python

I want to get URL in http://www.malware-traffic-analysis.net/
I want to get url when I insert keyword "nuclear".
but I block because I do not get total count.
this is my sample code.I almost done..
import urllib2
import json
import pprint
def customSearch(page,keyword):
url = 'https://www.googleapis.com/customsearch/v1?key=MY_API_KEY'
+ '&cx=007471739612924802870:ak59oowq-cq&num=10&start='
+ str(page)
+ '1&q='
+ keyword
data = urllib2.urlopen(url)
data = json.load(data)
return data
def getURL(keyword):
result=[]
try:
for i in range(1): # i want to get this range
data=customSearch(i,keyword)
for temp in data['items']:
result.append(temp["link"])
except:
None
content=""
for i in result:
content+=(i+"\n")
f=open(keyword+'.txt',"w")
f.write(content)
f.close()
getURL("Nuclear")

Related

Storing API data in Json fromat

I am having one API in that all location id’s and their respective info like(address, Lat, long) present. But if I want to fetch other extra attributes like location name, location area, location access then I need to give location id one by one as parameter in API to fetch their respective extra attributes.
I have written below code.but the problem with below code is the data is coming in console and i don't know how to take this information in json and then convert it into text file.
ids=location_id_df["id"] #stored location id in dataframe
authorization =”####################### "
print("started")
def test_api(url, authorization, rawfile,ids):
for i in range(0,1000,50):
for j in ids:
#print(j)
try:
request = urllib.request.Request('https:….. /locations/{}'.format(j)+"?
offset="+str(i),headers={'authorization':authorization})
response = urllib.request.urlopen(request).read()
print(response)
except HTTPError as e:
print(e)
sys.exit(0)
with open(rawfile + "_offset_" + str(i) + ".json", "wb") as json_download:
json_download.write(response)
test_api(url, authorization, rawfile,ids)
I need to fectch response in json like
5182021_offset_0.json #contains some location id's with extra attribute data
5182021_offset_50.json #contains some location id's with extra attribute data
5182021_offset_100.json #contains some location id's with extra attribute data
........................
.......................
Here is a simplified version of your example that queries an api that returns json and saves each result to a file.
import urllib.request
import json
for i in range(2):
responses = []
for j in range(3):
request = urllib.request.Request("https://www.boredapi.com/api/activity/")
response = urllib.request.urlopen(request)
if response.status == 200:
try:
response_bytes = response.read()
finally:
response.close()
response_string = response_bytes.decode("utf8")
response_data = json.loads(response_string)
responses.append(response_data)
file_name = "data-{}.json".format(i)
with open(file_name, "w") as f:
json.dump(responses, f)
I would suggest using the Requests library as it tends to have a simpler api than urllib, and is widely used by the python community. Here is the same example with the Requests library.
import requests
import json
for i in range(2):
responses = []
for j in range(3):
response = requests.get("https://www.boredapi.com/api/activity/")
if response.status_code == 200:
response_data = response.json()
responses.append(response_data)
file_name = "data-{}.json".format(i)
with open(file_name, "w") as f:
json.dump(responses, f)

scraper filters out word's instead of line

So I was trying to make a filter that filter's out the crap from this scrape, but I have an issue where it filters out the words. I would like to filter out the whole line instead of the words.
from bs4 import BeautifulSoup
import requests
import os
def Scrape():
page = input("Page: ")
url = "https://openuserjs.org/?p=" + page
source = requests.get(url)
soup = BeautifulSoup(source.text,'lxml')
os.system('cls')
Filter(soup)
def Filter(soup):
crap = ""
f = open("Data/Crap.txt", "r")
for craptext in f:
crap = craptext
for Titles in soup.select("a.tr-link-a>b"):
print(Titles.text.replace(crap, "").strip())
while True:
Scrape()
Instead of:
print(Titles.text.replace(crap, "").strip())
Try using:
if crap not in Titles.text:
print(Titles.text.strip())

Python - Data Scraping Tables

I'm attempting to scrape data for all the quarterbacks who have been drafted. http://www.nfl.com/draft/history/fulldraft?type=position
I'm able to scrape the data. However, there are blank lines that I cannot get rid of. Excel file output
Here is the code that I used.
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
playerdata = playerdatasaved = ""
soup = make_soup("http://www.nfl.com/draft/history/fulldraft?type=position")
for record in soup.findAll('tr'):
playerdata = ""
for data in record.findAll('td'):
playerdata = playerdata + "," + data.text
if len(playerdata)!= 0:
playerdatasaved = playerdatasaved + "\n" + playerdata[1:]
header = "Round, Selection #, Player, Position, School, Team Drafted" + "\n"
file = open("Quarterbacks.csv","wb")
file.write(bytes(header, encoding = "ascii", errors = 'igonore'))
file.write(bytes(playerdatasaved, encoding = "ascii", errors = 'igonore'))
I've tried to use an if statement to check for \n breaks and remove the breaks. Also, I've tried to turn the data into a string and use a replace or split command. None of these corrected the issue.
Thanks for any help that you can give me!

Python - scraping a paginated site and writing the results to a file

I am a complete programming beginner, so please forgive me if I am not able to express my problem very well. I am trying to write a script that will look through a series of pages of news and will record the article titles and their links. I have managed to get that done for the first page, the problem is getting the content of the subsequent pages. By searching in stackoverflow, I think I managed to find a solution that will make the script access more than one URL BUT it seems to be overwriting the content extracted from each page it accesses so I always end up with the same number of recorded articles in the file. Something that might help: I know that URLs follow the following model: "/ultimas/?page=1", "/ultimas/?page=2", etc. and it appears to be using AJAX to request new articles
Here is my code:
import csv
import requests
from bs4 import BeautifulSoup as Soup
import urllib
r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="
for page in range(1, 4):
url = "%s%d" % (program_url, page)
soup = Soup(urllib.urlopen(url))
letters = soup.find_all("div", class_="titulo-noticia")
letters[0]
lobbying = {}
for element in letters:
lobbying[element.a.get_text()] = {}
letters[0].a["href"]
prefix = "http://agenciabrasil.ebc.com.br"
for element in letters:
lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]
for item in lobbying.keys():
print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"
import os, csv
os.chdir("...")
with open("lobbying.csv", "w") as toWrite:
writer = csv.writer(toWrite, delimiter=",")
writer.writerow(["name", "link",])
for a in lobbying.keys():
writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])
import json
with open("lobbying.json", "w") as writeJSON:
json.dump(lobbying, writeJSON)
print "Fim"
Any help on how I might go about adding the content of each page to the final file would be very appreciated. Thank you!
How about this one if serving the same purpose:
import csv, requests
from lxml import html
base_url = "http://agenciabrasil.ebc.com.br"
program_url = base_url + "/ultimas/?page={0}"
outfile = open('scraped_data.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Caption","Link"])
for url in [program_url.format(page) for page in range(1, 4)]:
response = requests.get(url)
tree = html.fromstring(response.text)
for title in tree.xpath("//div[#class='noticia']"):
caption = title.xpath('.//span[#class="field-content"]/a/text()')[0]
policy = title.xpath('.//span[#class="field-content"]/a/#href')[0]
writer.writerow([caption , base_url + policy])
It looks like the code in your for loop (for page in range(1, 4):) isn't been called due to your file not been correctly indented:
If you tidy up your code, it works:
import csv, requests, os, json, urllib
from bs4 import BeautifulSoup as Soup
r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="
for page in range(1, 4):
url = "%s%d" % (program_url, page)
soup = Soup(urllib.urlopen(url))
letters = soup.find_all("div", class_="titulo-noticia")
lobbying = {}
for element in letters:
lobbying[element.a.get_text()] = {}
prefix = "http://agenciabrasil.ebc.com.br"
for element in letters:
lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]
for item in lobbying.keys():
print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"
#os.chdir("...")
with open("lobbying.csv", "w") as toWrite:
writer = csv.writer(toWrite, delimiter=",")
writer.writerow(["name", "link",])
for a in lobbying.keys():
writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])
with open("lobbying.json", "w") as writeJSON:
json.dump(lobbying, writeJSON)
print "Fim"

How to get all application link from the log text file?

I have a log file which contains:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
http://www.downloadray.com/windows/Photos_and_Images/Graphic_Capture/
http://www.downloadray.com/windows/Photos_and_Images/Digital_Photo_Tools/
I have this code:
from bs4 import BeautifulSoup
import urllib
import urlparse
f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
i = 1
while 1:
url = line+"?page=%d" % i
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
has_more = 1
for a in soup.select("div.n_head2 a[href]"):
try:
print (a["href"])
g.write(a["href"]+"\n")
except:
print "no link"
if has_more:
i += 1
else:
break
This code do not give error but it do not working.
I tried modified it but can't solved it.
But when I try this code,it works well:
from bs4 import BeautifulSoup
import urllib
import urlparse
g = open("downloadray3.txt", "w")
url = "http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/"
pageUrl = urllib.urlopen(url)
soup = BeautifulSoup(pageUrl)
i = 1
while 1:
url1 = url+"?page=%d" % i
pageHtml = urllib.urlopen(url1)
soup = BeautifulSoup(pageHtml)
has_more = 2
for a in soup.select("div.n_head2 a[href]"):
try:
print (a["href"])
g.write(a["href"]+"\n")
except:
print "no link"
if has_more:
i += 1
else:
break
So how can I make it can read from the log text file. It is hard to take link one by one to be read.
Have you stripped the newline from the end of the line?
for line in f.readlines():
line = line.strip()
readlines() will produce a list of lines taken from the file including the newline \n character.
Proof Evidence by printing url variable (after the line url = line+"?page=%d" % i):
Your original code:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=1
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=2
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=3
With my suggested fix:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=1
http://www.downloadray.com/TIFF-to-JPG_download/
http://www.downloadray.com/Moo0-Image-Thumbnailer_download/
http://www.downloadray.com/Moo0-Image-Sizer_download/
http://www.downloadray.com/Advanced-Image-Viewer-and-Converter_download/
http://www.downloadray.com/GandMIC_download/
http://www.downloadray.com/SendTo-Convert_download/
http://www.downloadray.com/PNG-To-JPG-Converter-Software_download/
http://www.downloadray.com/Graphics-Converter-Pro_download/
http://www.downloadray.com/PICtoC_download/
http://www.downloadray.com/Free-Images-Converter_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=2
http://www.downloadray.com/VarieDrop_download/
http://www.downloadray.com/Tinuous_download/
http://www.downloadray.com/Acme-CAD-Converter_download/
http://www.downloadray.com/AAOImageConverterandFTP_download/
http://www.downloadray.com/ImageCool-Converter_download/
http://www.downloadray.com/GeoJpeg_download/
http://www.downloadray.com/Android-Resizer-Tool_download/
http://www.downloadray.com/Scarab-Darkroom_download/
http://www.downloadray.com/Jpeg-Resizer_download/
http://www.downloadray.com/TIFF2PDF_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=3
http://www.downloadray.com/JGraphite_download/
http://www.downloadray.com/Easy-PNG-to-Icon-Converter_download/
http://www.downloadray.com/JBatch-It!_download/
http://www.downloadray.com/Batch-It!-Pro_download/
http://www.downloadray.com/Batch-It!-Ultra_download/
http://www.downloadray.com/Image-to-Ico-Converter_download/
http://www.downloadray.com/PSD-To-PNG-Converter-Software_download/
http://www.downloadray.com/VectorNow_download/
http://www.downloadray.com/KeitiklImages_download/
http://www.downloadray.com/STOIK-Smart-Resizer_download/
Update:
Then again, this code won't run as expected, because the while loop will never continue since the has_more variable is never changed.
You know that you don't have more links when the list returned by `soup.select(...)` is empty. You can check for emptiness using `len(...)`. So that part might go like this:
list_of_links = soup.select("div.n_head2 a[href]")
if len(list_of_links)==0:
break
else:
for a in soup.select("div.n_head2 a[href]"):
print (a["href"])
g.write(a["href"]+"\n")
i += 1
Apparently the page still display the latest page available if it's queried beyond the maximum page. So if the maximum page number available is 82 and you query page 83, it will give page 82. To detect this case, you can save the list of previous page urls, and compare it with current list of urls.
Here is the full code (tested):
from bs4 import BeautifulSoup
import urllib
import urlparse
f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
line = line.strip()
i = 1
prev_urls = []
while 1:
url = line+"?page=%d" % i
print 'Examining %s' % url
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
list_of_urls = soup.select("div.n_head2 a[href]")
if set(prev_urls)==set(list_of_urls):
break
else:
for a in soup.select("div.n_head2 a[href]"):
print (a["href"])
g.write(a["href"]+"\n")
i += 1
prev_urls = list_of_urls

Categories

Resources