Scraping phone numbers and ZIP codes from URLs in a CSV and exporting to another CSV - Python

I need to scrape a list of URLs stored in a CSV and export the results to another CSV. I must be making some mistake because I can't run it, so I'd appreciate any help.
I'm very new to Python and I stitched together code from several sources, so I have trouble identifying where the problem is: I mixed code that imports a CSV with code that performs a string search.
import scrapy
from scrapy import Spider
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen,urlparse, Request,HTTPError
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl
The following is the code that I have so far.
phn_1 = []
zipcode_1 = []
err_msg_zipcode = []
err = []

class Spider:
    name = 'spider'

    # read csv with just url per line
    with open('urls.csv') as file:
        start_urls = [line.strip() for line in file]

    def start_request(self):
        request = Request(url=self.start_urls, callback=self.parse)
        yield request

    def parse(self, response):
        s = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        df2 = pd.DataFrame()
        phn_1 = []   # store all the extracted Phn numbers in a List
        mail_1 = []  # store all the extracted Zipcode in a List
        for line in df2.iterrows():  # Parse through each url in the list.
            try:
                try:
                    req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
                    gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # Bypass SSL certification verification
                    f = urlopen(req1, context=gcontext)
                    url_name = f.geturl()  # extract URL name
                    s = f.read()
                    phone = re.findall(r'\d{3}-\d{3}-\d{4}', s, re.MULTILINE)
                    zipcode = re.findall(r'(?<=, [A-Z]{2} )\d{5}', s, re.MULTILINE)
                    if len(phone) == 0:
                        print("No phone number found.")
                        err_msg_phn = "No phone number found."
                        phn_1.append((url_name, err_msg_phn))
                    else:
                        count = 1
                        for item in phone:
                            phn_1.append((url_name, item))
                            count += 1
                        print(phn_1)
                    if len(zipcode) == 0:
                        print("No zipcode found.")
                        err_msg_zipcode = "No zipcode address found."
                        zipcode_1.append((url_name, err_msg_zipcode))
                    else:
                        count = 1
                        for item in zipcode:
                            mail_1.append((url_name, item))
                            count += 1
                        print(mail_1)
                except BadStatusLine:  # Catch if invalid url names exist
                    print("could not fetch %s" % url_name)
            except urllib3.request.HTTPError as err:  # catch HTTP 404 not found error
                if err == 404:
                    print("Received HTTPError on %s" % url_name)

df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()

df_p = pd.DataFrame(phn_1, columns=['URL', 'Phone_No'])                  # Dataframe for url and Phn number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first')  # remove duplicates
df_m = pd.DataFrame(zipcode_1, columns=['URL', 'Zipcode'])               # Dataframe for url and Zipcode
df_mail = df_m.drop_duplicates(subset=['URL', 'Zipcode'], keep='first')  # remove duplicates
df_final = pd.merge(df_phn, df_mail, on='URL', how='inner')              # Merge two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False, encoding='utf-8')

#convert the csv output to json
with open('result_contact.csv') as f:
    reader = csv.DictReader(f)
    rows = list(reader)
Thank you!!!

One obvious mistake I see is here:
    request = Request(url = self.start_urls, callback=self.parse)
url should be a string, but you are sending a list. If you want to send multiple requests, you need to use a loop. Since you are already setting start_urls and using the parse callback, you do not need to override start_requests at all; the default implementation takes care of it.
You may also want to consider setting start_urls in the __init__ method rather than in the class body.
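For illustration only, here is a minimal sketch (my own, not the asker's code) of how the spider could look if it subclasses scrapy.Spider and relies on the default start_requests; the regex extraction in parse and the yielded item fields are assumptions based on the question, not a tested solution:

import re
import scrapy

class ContactSpider(scrapy.Spider):
    name = 'contact_spider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # one URL per line in the CSV
        with open('urls.csv') as f:
            self.start_urls = [line.strip() for line in f if line.strip()]

    # no start_requests override needed: the default implementation yields
    # one Request per entry in start_urls, with self.parse as the callback
    def parse(self, response):
        text = response.text
        phones = re.findall(r'\d{3}-\d{3}-\d{4}', text)
        zipcodes = re.findall(r'(?<=, [A-Z]{2} )\d{5}', text)
        yield {'URL': response.url, 'phones': phones, 'zipcodes': zipcodes}

Running it with something like scrapy runspider contact_spider.py -o result_contact.csv would then write the yielded items straight to CSV.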

Related

How to extract daily close from WSJ using Python?

I used Python 3 and pandas to parse the daily close from WSJ into Excel. However, the daily close shown on the web page cannot be extracted. Here is the link: "https://quotes.wsj.com/index/COMP/historical-prices"
How do I download the close data shown on screen into Excel?
And how do I download the file behind the "DOWNLOAD A SPREADSHEET" button into Excel under another name, like comp.xlsx?
Here is the code:
import requests
import pandas as pd

url = 'https://quotes.wsj.com/index/COMP/historical-prices'
jsonData = requests.get(url).json()
final_df = pd.DataFrame()
for row in jsonData['data']:
    #row = jsonData['data'][1]
    data_row = []
    for idx, colspan in enumerate(row['colspan']):
        colspan_int = int(colspan[0])
        data_row.append(row['td'][idx] * colspan_int)
    flat_list = [item for sublist in data_row for item in sublist]
    temp_row = pd.DataFrame([flat_list])
    final_df = final_df.append(temp_row, sort=True).reset_index(drop=True)

wait2 = input("PRESS ENTER TO CONTINUE.")
Follow-up question code:
#
url = 'https://quotes.wsj.com/index/HK/XHKG/HSI/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('HSI.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.xlsx', index = None, header=True)
#
url = 'https://quotes.wsj.com/index/SPX/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('SPX.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.xlsx', index = None, header=True)
#
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('COMP.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.xlsx', index = None, header=True)
The URL you are using is the wrong one; once you download the spreadsheet manually you can do "Get Info" on a Mac and look at "Where From:". You will see the download URL is of the form below.
import requests
import pandas as pd
import io
#original URL had a bunch of other parameters I omitted, only these seem to matter but YMMV
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
response = requests.get(url)
#do this if you want the CSV written to your machine
open('test_file.csv', 'wb').write(response.content)
# this decodes the content of the downloaded response and presents it to pandas
df_test = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
To answer your additional question -- you can simply loop across a list of tickers or symbols, something like:
base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP', 'SPX', 'HK/XHKG/HSI']
for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name=ticker))
    #do this if you want the CSV written to your machine
    open('prices_' + ticker.replace('/', '-') + '.csv', 'wb').write(response.content)
Note for HK/XHKG/HSI, we need to replace the slashes with hyphens or it's not a valid filename. You can also use this pattern to make dataframes.
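If you also want the Excel output the original question asked about, a rough sketch (the file names here are illustrative, and to_excel needs openpyxl installed) would be to feed each response into pandas and write it back out:

import io

import pandas as pd
import requests

base_url = ('https://quotes.wsj.com/index/{ticker_name}/historical-prices/'
            'download?num_rows=360&range_days=360&endDate=11/06/2019')
for ticker in ['COMP', 'SPX', 'HK/XHKG/HSI']:
    response = requests.get(base_url.format(ticker_name=ticker))
    # decode the CSV payload into a dataframe
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    safe_name = ticker.replace('/', '-')           # slashes are not valid in filenames
    df.to_excel(safe_name + '.xlsx', index=False)  # e.g. COMP.xlsx, SPX.xlsx, HK-XHKG-HSI.xlsx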

Can't write to a CSV file in Python

I am trying to write scraped data to a CSV using a pandas DataFrame, but the CSV is empty even after the program runs. The headers are written first, but they are overwritten as soon as the DataFrame writes come into play.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re as resju
import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')

links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

headers = ['Name', 'LINK']
# this is the output file, you can change the path as you desire, default is the working directory
file = open('data123.csv', 'w', encoding="utf-8")
writer = csv.writer(file)
writer.writerow(headers)

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']
    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text
    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)

    df = pd.DataFrame(zip(Name, final_link))
    df.to_csv(file, header=None, index=False)

file.close()
df.head() returns:
0 1
0 ل h
1 ي t
2 ل t
3 ى p
4 s
0 1
0 ل h
1 ي t
2 ل t
3 ى p
4 s
Any suggestions?
It seems you are using a mix of libraries to write the CSV; pandas handles all of this nicely, so there is no need to use Python's built-in csv module.
I've modified your code below: it collects everything first, builds the DataFrame once, and writes it out as a CSV.
Also, with header=None you were setting the columns to nothing, so they would only be referenced by an index number.
from bs4 import BeautifulSoup
import requests
import re as resju
#import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')

links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

names_ = []        # global list to hold all iterable variables from your loops
final_links_ = []

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    names_.append(Name)  # append to global list
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']
    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text
    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)
    final_links_.append(final_link)  # append to global list

df = pd.DataFrame(zip(names_, final_links_))  # use global lists
df.columns = ['Name', 'LINK']
df.to_csv('data123.csv', index=False)  # write once, by filename, since the csv file handle is no longer used
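As a side note, here is a tiny sketch (the name and link values below are made up) of why the original rows came out one character at a time: zip() over two strings pairs up their individual characters, whereas wrapping the scalars in lists keeps one row per video.

import pandas as pd

name = 'ليلى'                                   # illustrative values only
link = 'https://example.com/video.mp4'
print(pd.DataFrame(zip(name, link)).head())            # character pairs, one per row
print(pd.DataFrame({'Name': [name], 'LINK': [link]}))  # a single row per video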

Looping over rows and scraping data taking input from an Excel file

I want to scrape web data using input values from an Excel/CSV file, scraping the web for each row value and saving the output back to the same file.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
ciks = pd.read_csv("ciks.csv")
ciks.head()
output:
       CIK
0  1557822
1  1598429
2  1544670
3  1574448
4  1592290
then
for x in ciks:
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + x + "&owner=exclude&action=getcompany"
    r = request.urlopen(url)
    bytecode = r.read()
    htmlstr = bytecode.decode()
    soup = BeautifulSoup(bytecode)
    t = soup.find('span', {'class': 'companyName'})
    print(t.text)
I got an error:
----> 9 print(t.text)
AttributeError: 'NoneType' object has no attribute 'text'
Here, I want to scrape web data taking each row value as input from the CSV file.
It would be easier to convert the column values to a list and then use that list in the for loop; see the solution below.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd

#ciks = pd.read_csv("ciks.csv")
df = pd.read_csv("ciks.csv")
mylist = df['CIK'].tolist()  # CIK is the column name
company = []

for item in mylist:
    print(item)
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + str(item) + "&owner=exclude&action=getcompany"
    r = request.urlopen(url)
    bytecode = r.read()
    htmlstr = bytecode.decode()
    soup = BeautifulSoup(bytecode, features="lxml")
    t = soup.find('span', {'class': 'companyName'})
    company.append(t.text)
    print(t.text)

df = df.assign(company=company)  # assign returns a new DataFrame, so keep the result
print(df)
df.to_csv("ciks.csv")

How can I check a parameter in a while loop for each item in an array?

I'm trying to get titles from an old website.
The problem is that in some cases I get a null value, so I tried adding a while loop that changes the URL.
Is my while loop in the right place?
The procedure is like this:
open file
get url
check url
get title
print title
while title is null:
    replace part of the url and check the url again
from urllib.request import urlopen
from bs4 import BeautifulSoup
from openpyxl import Workbook
import os
import xlrd
import lxml

# set file location
os.chdir("/excel_files")

# set the name of the file
file_name = "old.xlsx"

# open workbook
workbook = xlrd.open_workbook(file_name)

# set existing worksheet
sheet = workbook.sheet_by_index(0)

temp_list = [20131022212405, 20090127003537, 2009012702352]

for i in range(sheet.nrows):
    try:
        u = sheet.cell_value(i, 1)
        html = urlopen(u)
        bsObj = BeautifulSoup(html.read(), features='lxml')
        # get title
        title = str(bsObj.title)
        print('row no. ', i, 'title is :', title)
    except:
        title = 'null'
        while (title == 'null'):
            try:
                u = u.replace(temp_list[i], temp_list[i + 1])
                html = urlopen(u)
                bsObj = BeautifulSoup(html.read(), features='lxml')
                title = str(bsObj.title)
            except:
                print('title is :', title)
I'm getting null all the time, instead of only for the rows that are actually null.
It looks like the try/except indentation in your first for loop (for i in range(sheet.nrows):) is wrong; try and except should be at the same indentation level, as in the sketch below.
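A sketch of the structure that sentence describes, keeping the question's own retry logic; narrowing the bare excepts to Exception, converting the timestamps to str before replace, and adding the final break (so a second failure cannot loop forever) are my additions:

from urllib.request import urlopen
from bs4 import BeautifulSoup

# reuses the question's setup: workbook/sheet from xlrd and temp_list
for i in range(sheet.nrows):
    try:
        u = sheet.cell_value(i, 1)
        html = urlopen(u)
        bsObj = BeautifulSoup(html.read(), features='lxml')
        title = str(bsObj.title)
        print('row no.', i, 'title is:', title)
    except Exception:                     # except aligned with its try
        title = 'null'
        while title == 'null':
            try:
                u = u.replace(str(temp_list[i]), str(temp_list[i + 1]))
                html = urlopen(u)
                bsObj = BeautifulSoup(html.read(), features='lxml')
                title = str(bsObj.title)
            except Exception:
                print('title is:', title)
                break                     # give up on this row instead of spinning forever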

Getting "FileNotFoundError: [Errno 2] No such file or directory: 'posted.txt'" even though I do have such a file

Hey guys, I'm working on a Twitter bot that takes posts from Reddit and tweets them. My problem right now is that when I run it I get "FileNotFoundError: [Errno 2] No such file or directory: 'posted.txt'".
The thing is, as you can see in the screenshot below and in my code, 'posted.txt' does exist and it is in the same directory, so I am stuck on what the actual problem is. I have a nearly identical program where the already_tweeted function works; the only difference is that this one also takes in image files (using BeautifulSoup). Could that be contributing to the error?
This is not the complete project, only what is hopefully relevant:
import praw
import tweepy
import time
import os
from bs4 import BeautifulSoup as bs
import requests

posted_reddit_ids = 'posted.txt'

def tweet_creator(subreddit_info):
    '''Goes through posts on reddit and extracts a shortened link, title & ID'''
    post_links = []   # list to store our links
    post_titles = []  # list to store our titles
    post_ids = []     # list to store our id's
    post_imgs = []
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if not already_tweeted(submission.id):
            post_titles.append(submission.title)
            post_links.append(submission.shortlink)
            post_ids.append(submission.id)
            post_imgs = get_image(submission.url)
            print(post_imgs)
        else:
            print("Already Tweeted")
    return post_links, post_titles, post_ids, post_imgs

def already_tweeted(id):
    '''reads through our .txt file and determines if tweet has already been posted'''
    found = 0
    with open(posted_reddit_ids, 'r') as f:
        for line in f:
            if id in line:
                found = 1
                break
    return found

def main():
    '''Main function'''
    # If the tweet tracking file does not already exist, create it
    if not os.path.exists(posted_reddit_ids):
        with open(posted_reddit_ids, 'w'):
            pass
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)

if __name__ == '__main__':
    main()
(Screenshot showing that the file and the program are in the same directory.)
Edit: It seems the error completely goes away when I remove the line post_imgs = get_image(submission.url).
Here is my code for the get_image function; maybe this can help solve my problem:
def get_image(img_url):
    url = img_url
    r = requests.get(url, headers={'User-Agent': 'reddit Twitter tool monitoring (by /u/RivianJourneyMan)'})
    data = r.text
    soup = bs(data, 'lxml')
    image_tags = soup.findAll('img')
    os.chdir(img_dir)
    x = 0
    mylist = []
    for image in image_tags:
        try:
            url = image['src']
            source = requests.get(url, stream=True)
            if source.status_code == 200:
                img_file = img_dir + str(x) + '.jpg'
                with open(img_file, 'wb') as f:
                    f.write(requests.get(url).content)
                mylist.append(img_file)
                f.close()
                x += 1
                return img_file
        except:
            mylist.append(None)
    print(mylist)
    return mylist
What I am trying to accomplish here is to return a list of .jpg files from the get_image function and then copy that list over to post_imgs in the tweet_creator function.
I used pathlib instead of the os module, and it did not raise the FileNotFoundError:
#!/usr/bin/env python3.6
import time

import praw
import requests
import tweepy
from bs4 import BeautifulSoup as bs
from pathlib import Path

posted_reddit_ids = "posted.txt"

def tweet_creator(subreddit_info):
    """Goes through posts on reddit and extracts a shortened link, title & ID"""
    post_links = []   # list to store our links
    post_titles = []  # list to store our titles
    post_ids = []     # list to store our id's
    post_imgs = []
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if not already_tweeted(submission.id):
            post_titles.append(submission.title)
            post_links.append(submission.shortlink)
            post_ids.append(submission.id)
            post_imgs = get_image(submission.url)
            print(post_imgs)
        else:
            print("Already Tweeted")
    return post_links, post_titles, post_ids, post_imgs

def already_tweeted(id):
    """reads through our .txt file and determines if tweet has already been posted"""
    return id in Path(posted_reddit_ids).read_text()

def main():
    """Main function"""
    # If the tweet tracking file does not already exist, create it
    Path(posted_reddit_ids).exists() or Path(posted_reddit_ids).write_text("")
    Path(img_dir).exists() or Path(img_dir).mkdir(parents=True)

    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)

if __name__ == "__main__":
    main()
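One possible explanation for the original error, which is my own guess rather than something this answer states: get_image() calls os.chdir(img_dir), so the relative name 'posted.txt' is later resolved inside img_dir instead of the script's folder. Anchoring the path to the script's own directory would sidestep that, for example:

from pathlib import Path

# resolve the tracking file against the script's directory, so a later
# os.chdir() inside get_image() cannot change what 'posted.txt' points to
SCRIPT_DIR = Path(__file__).resolve().parent
posted_reddit_ids = SCRIPT_DIR / "posted.txt"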
