I have an HTML document with a PDF document embedded in base64-encoded format. I would like to extract the string and save it as a PDF file, and I am using the code below to do so.
However, when I open the result in Adobe Reader, it reports an invalid format. I am looking for a way to fix this issue.
I think the PDF data was additionally encoded with JavaScript's encodeURIComponent function, so it needs to be URL-decoded in Python before it is base64-decoded.
sample embed tag
<embed type="application/pdf" src="data:application/pdf;base64,JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PC9D">
Code
import base64
import io
from pathlib import Path


def decode_b64():
    """Decode the hard-coded base64 sample and save it as ~/Downloads/mytest.pdf.

    The payload is base64-decoded exactly as given; no URL-unescaping is
    performed on it first.
    """
    b64 = "JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PC9D"
    # io.BytesIO replaces the undefined Python 2 style `BytesIO.BytesIO()`.
    buffer = io.BytesIO()
    content = base64.b64decode(b64)
    buffer.write(content)
    with open(Path(Path.home(), 'Downloads', 'mytest.pdf'), "wb") as f:
        f.write(buffer.getvalue())


if __name__ == "__main__":
    decode_b64()
=== Update 1:
I found a way to do the conversion using JavaScript. It would be nice if this code could be ported to Python.
// Extracts the base64 PDF held in the first <embed> element of pdf-file.html
// and writes the decoded bytes to pdf-file.pdf.
const { readFileSync, writeFile } = require('fs');
const { DOMParser } = require('xmldom');

const DATA_URI_PREFIX = 'data:application/pdf;base64,';

const html = readFileSync('pdf-file.html', 'utf-8');
const virtualDoc = new DOMParser().parseFromString(html, 'text/html');
const elem = virtualDoc.getElementsByTagName('embed')[0];
if (!elem) {
  // Fail with a clear message instead of a TypeError on `elem.attributes`.
  throw new Error('pdf-file.html contains no <embed> element');
}

// Read the attribute directly rather than scanning every attribute by hand.
const src = elem.getAttribute('src');
if (src) {
  // The payload is percent-encoded on top of base64, so undo that first.
  const encoded = decodeURIComponent(src.replace(DATA_URI_PREFIX, ''));
  const buff = Buffer.from(encoded, 'base64');
  writeFile('pdf-file.pdf', buff, (err) => {
    if (err) {
      console.error(err);
    }
  });
}
This is a situation that you should have been able to chase down yourself. I wasn't 100% sure how Javascript encoded those two characters, so I wrote up a simple HTML page:
<script>
  // Show how encodeURIComponent escapes the base64 characters "+" and "/".
  const sample = "abcde++defgh//";
  const escaped = encodeURIComponent(sample);
  alert(escaped);
</script>
When I ran that page, the result was "abcde%2B%2Bdefgh%2F%2F", and that is all the information you need to fix up those strings.
def decode_b64(b64="JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PC9D......", dest=None):
    """Decode a percent-encoded base64 payload and write it to a file.

    Args:
        b64: Base64 text, optionally percent-encoded the way JavaScript's
            encodeURIComponent produces it. Defaults to the sample payload.
        dest: Output path. Defaults to ~/Downloads/mytest.pdf.

    Returns:
        The decoded bytes that were written.

    Raises:
        binascii.Error: If the unescaped payload is not valid base64.
    """
    import base64
    from pathlib import Path
    from urllib.parse import unquote

    # encodeURIComponent escapes "+", "/" AND the "=" padding (%3D); unquote
    # reverses every percent-escape instead of a hand-picked subset.
    b64 = unquote(b64)
    content = base64.b64decode(b64)
    if dest is None:
        dest = Path(Path.home(), 'Downloads', 'mytest.pdf')
    with open(dest, "wb") as f:
        f.write(content)
    return content
Related
How can I extract the URL from a script of HTML with Python?
The HTML provided:
/**
 * Passes the hard-coded link to window.open.
 */
function download() {
  const target = 'https:somelink.com';
  window.open(target);
}
// Markup injected by embed(); the iframe's src is the placeholder URL.
const text = `<div style=\'position: relative;padding-bottom: 56.25%;height: 0;overflow: hidden;\'>
<iframe allowfullscreen=\'allowfullscreen\' src=\'URL\' style=\'border: 0;height: 100%;left: 0;position: absolute;top: 0;width: 100%;\' ></iframe>
</div>`;

/**
 * Logs the #embed-text element and replaces its content with the markup in `text`.
 */
function embed() {
  const target = document.getElementById('embed-text');
  console.log(target);
  target.innerHTML = text;
}
Desired output will be:
https://somelink.com
Any help will do. Thanks!
You should use regex like this:
// Matches an http(s) URL and stops at whitespace, quotes or angle brackets, so a
// URL sitting directly inside src='...' is captured without the closing quote.
const urlRegex = /(https?:\/\/[^\s"'<>]+)/;
// your string
const input = "<div style=\'position: relative;padding-bottom: 56.25%;height: 0;overflow: hidden;\'><iframe allowfullscreen=\'allowfullscreen\' src=\" https://my-url.com/test \" style=\'border: 0;height: 100%;left: 0;position: absolute;top: 0;width: 100%;\' ></iframe></div>";
// match() returns null when no URL is present, so guard before indexing.
const match = input.match(urlRegex);
console.log(match ? match[1] : null); // log the first URL found, or null
I'm invoking a Python script from Node.js. The Python script retrieves data from a REST API, stores it in a DataFrame, and then runs a search based on user input. I'm confused about what variable type Python uses when it sends the data to Node.js. I've tried converting it to a string, but Node.js says it is an unresolved variable type. Here's the code:
# Fetch the REST payload, flatten its "items" records into a DataFrame and print
# the rows whose requestor name contains the search term given on the command line.
# NOTE(review): `url` is not defined in this snippet; it must be set before this point.
r = requests.get(url)
nested = r.json()  # parse the body once (it was parsed and normalised twice)
req_data = json_normalize(nested, record_path='items')
search = req_data.get(["name", "id"])
#search.head(10)

if len(sys.argv) < 2:
    sys.exit("usage: search.py <search term>")
terms = sys.argv[1:]
print(terms)
term = terms[0]
print(term)

# Build the mask on req_data because `search` keeps only the "name" and "id"
# columns; the mask shares req_data's index, so it can select rows of `search`.
# NOTE(review): assumes the items carry a "requestor_name" field - confirm against the API.
# na=False treats missing names as non-matching instead of producing NaN in the mask.
mask = req_data["requestor_name"].str.contains(term, na=False)
result = search[mask].to_string(index=False)
response = '```' + str(result) + '```'
print(response)
sys.stdout.flush()
Here's the Node.js program that invokes the above Python script. How do I store the output in a format that I can pass to another function in Node?
// Runs search.py with a search term, collects everything it prints to stdout
// and reports the collected text once the process has exited.
const { spawn } = require('child_process');
const stream = require('stream');
const format = require('string-format');

const input = 'robert';
// stdout has to be a pipe to be readable from this process; with
// {stdio: 'inherit'} childProcess.stdout is null and nothing can be captured.
const childProcess = spawn('python', ['./search.py', input], {
  stdio: ['inherit', 'pipe', 'inherit'],
});

let result = '';
// 'data' is emitted by the stdout stream, not by the ChildProcess itself.
childProcess.stdout.on('data', (data) => {
  result += String(data);
});

childProcess.on('error', (err) => {
  // Emitted when the process could not be spawned at all.
  console.error(err);
  process.exit(1);
});

childProcess.on('close', (code) => {
  // Any non-zero exit code is a failure, not only 1.
  if (code !== 0) {
    process.stderr.write(`error occured ${code}\n`);
    process.exit(1);
    return;
  }
  // `result` now holds the complete output as a string and can be passed on.
  console.log("Here it is", result);
  process.stdout.write('done');
});
According to the docs:
// Log every chunk the child process writes to its stdout.
childProcess.stdout.on('data', function logChunk(chunk) {
  console.log(`stdout: ${chunk}`);
});
I'm trying to program a web crawler.
I have server.js / crawling.js / dataCrawler.py.
When server.js calls crawlData (defined in crawling.js), that function uses spawn to execute dataCrawler.py.
I need the data in server.js, but executing dataCrawler.py takes a while, so I get null or undefined instead of the proper data.
Do you have any solution? Or has anyone had the same issue?
My code is below. (It is not complete; it is only meant to show the structure.)
//server.js
// Load the crawler module, call crawlData and print whatever it returns.
const crawler = require("./crawling");
const resultArr = crawler.crawlData();
console.log('nodeserver:', resultArr);
//crawling.js
exports.crawlData = ()=>{
var dataArr = [];
var temp;
var py = spawn('python', ['dataCrawler.py']);
var data = [totalUrl, gubun];
var dataFromPy = null;
py.stdout.on('data', function(result){
var dataArr = encoding.convert(result, 'utf-8')
dataArr = JSON.parse(encoding.convert(result, 'utf-8'));
py.stdout.on('end', function(){
temp = dataArr
});
});
py.stdin.write(JSON.stringify(data));
py.stdin.end();
return temp;
}
//dataCrawler.py
def crawling(url, gubun, page_count):
idx = 0
result = []
jsonData = {}
for i in range(1, page_count + 1):
....
crawling code
....
return result
def main():
    """Read the crawl request from stdin, run the crawl and print the result as JSON.

    stdin is expected to carry one JSON array whose first two entries are the
    url and the gubun value.
    """
    paraFromServer = json.loads(sys.stdin.read())
    url = paraFromServer[0]
    gubun = paraFromServer[1]
    # NOTE(review): page_count is not defined in this snippet; it must be
    # provided at module level (or added to the request) before this runs.
    result = crawling(url, gubun, page_count)
    # Emit real JSON: print(result) would write Python's repr (single quotes,
    # None/True/False), which a JSON parser on the receiving side rejects.
    print(json.dumps(result))
    sys.stdout.flush()


if __name__ == "__main__":
    main()
You didn't account for the asynchronous nature of javascript. What you have to do is, pass in a callback method to crawlData method, which will be invoked once scraping is done.
exports.crawlData = (cb)=>{
....
py.stdout.on('data', function(result){
var dataArr = encoding.convert(result, 'utf-8')
dataArr = JSON.parse(encoding.convert(result, 'utf-8'));
py.stdout.on('end', function(){
cb(dataArr); // ideally the pattern is cb(error, data)
});
});
...
So server.js becomes:
// Start the crawl and handle the data once the callback fires.
const crawler = require("./crawling");
crawler.crawlData(function onCrawled(data) {
  console.log(data);
  // Do whatever you want to do with the data.
});
Callbacks can cause Callback hell. Try exploring promises or async / await.
Alternatively you can use spawnSync if running in parallel isn't a concern
exports.crawlData = () => {
const result = spawnSync('python', ['dataCrawler.py'], {
input: JSON.stringify([totalUrl, gubun])
});
return JSON.parse(encoding.convert(result, 'utf-8'));
}
Hello, I need this Java code in Python.
I have programmed it in Java, but I need the same code in Python.
The script reads lines from an input file, regroups them, and writes the result to another file.
Input File
cc_oo_g_csss.sh
cc_oo_guv_zppp.sh
cc_aba_ddd.sh
cc_aba_ccxyp.sh
cc_abus_pl_fa_part1.sh
cc_abus_pl_fa_part2.sh
cc_abus_pl_fa_part3.sh
cc_abus_pl_fa_part4.sh
cc_abus_pl_fa_part5.sh
cc_abus_pl_fa_part6.sh
cc_abus_pl_fa_part7.sh
cc_abus_pl_fa_part8.sh
cc_abus_pl_fa_merge.sh
cc_abac_nsv_ssd.sh
cc_abac_kriv.sh
cc_rufrep_nia_inst_leg.sh
cc_rufrep_nia_inst_comb.sh
cc_rufrep_nia_inst_flow.sh
cc_rufrep_nia_inst.sh
cc_vision_kriv.sh
cc_vision_interface_part1.sh
cc_vision_interface_part2.sh
cc_vision_interface_part3.sh
cc_vision_interface_part4.sh
cc_vision_interface_merge.sh
cc_vision_deriv.sh
cc_ria_flows_rep_plain.sh
cc_iaed_fls_rep_merge.sh
I need as a file
cc_oo_g_csss.sh
cc_oo_guv_zppp.sh
cc_aba_ddd.sh
cc_aba_ccxyp.sh
D:\Temp\c_run_multiple_shell_skripts.sh
-S "cc_abus_pl_fa_part1.sh, cc_abus_pl_fa_part2.sh, cc_abus_pl_fa_part3.sh, cc_abus_pl_fa_part4.sh,
cc_abus_pl_fa_part5.sh, cc_abus_pl_fa_part6.sh, cc_abus_pl_fa_part7.sh, cc_abus_pl_fa_part8.sh "
-F cc_abus_pl_fa_merge.sh
cc_abac_nsv_ssd.sh
cc_abac_kriv.sh
cc_rufrep_nia_inst_leg.sh
cc_rufrep_nia_inst_comb.sh
cc_rufrep_nia_inst_flow.sh
cc_rufrep_nia_inst.sh
cc_vision_kriv.sh
D:\Temp\c_run_multiple_shell_skripts.sh
-S "cc_vision_interface_part1.sh, cc_vision_interface_part2.sh, cc_vision_interface_part3.sh,
cc_vision_interface_part4.sh"
-F cc_vision_interface_merge.sh
cc_vision_deriv.sh
cc_ria_flows_rep_plain.sh
cc_iaed_fls_rep_merge.sh
public class shellsort {
public static Vector<String> sortforshell(String path) throws IOException{
String[] input = readFile(path,Charset.defaultCharset()).split(System.getProperty("line.separator"));
Vector<String> output = new Vector<String>();
int i = 0;
while(i<input.length){
if(input[i].contains("part")){
output.add("D:/hhh/cc_multiple_script.sh");
String partLine = "-S ";
partLine = partLine.concat(input[i]);
String validate =input[i].substring(0,input[i].indexOf("part"));
i++;
while(input[i].contains("part")&&input[i].contains(validate)){
partLine = partLine.concat(", "+input[i]);
i++;
}
output.add(partLine);
if(input[i].contains("merge")&&input[i].contains(validate)){
output.add("-F "+input[i]);
i++;
}
}else{
output.add(input[i]);
i++;
}
}
return output;
}
static String readFile(String path, Charset encoding) throws IOException {
byte[] encoded = Files.readAllBytes(Paths.get(path));
return new String(encoded, encoding);
}
public static void main(String[]args) throws IOException{
Vector<String> output = sortforshell("/input.txt");
for(int i=0;i<output.size();i++){
System.out.println(output.get(i));
}
}
}
sjadjhdahs >
asdasda
Read file:
# The with-block closes the file even if read() raises.
with open('/tmp/file') as f:
    out = f.read()
Write file:
# The with-block flushes and closes the file even if write() raises.
with open('/tmp/file', 'w') as f:
    f.write('some text')
Simple enough, huh?
I'm using Tornado Webserver and the jQuery Webcam Plugin.
Everything is going fine except that I don't think i'm getting the raw data properly. I'm getting "FFD8FFE000104A46494600010100000100010000FFDB0084000503040404030504040405050506070C08070707070F0B0B090C110F1212110F111113161C1713141A1511111821181A1D1D1F1F1F13172224221E241C1E1F1E010505050706070E08080E1E1411141E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1E1" for my data.
frontend:
// Options for the jQuery webcam plugin attached to #camera ("save" mode).
const webcamOptions = {
  width: 320,
  height: 240,
  mode: "save",
  swffile: "/static/js/jscam.swf",
  onTick() {
    alert('OnTick');
  },
  onCapture() {
    webcam.capture();
    webcam.save('/saveimage');
  },
  onDebug(type, string) {
    alert('error');
    alert(type + ": " + string);
  },
};

$("#camera").webcam(webcamOptions);
backend:
# Store the request body as a .jpg under static/studentphotos/.
filecontent = self.request.body
# Binary mode keeps the body's bytes untouched, and the with-block guarantees
# the file is closed even if the write fails.
with open('static/studentphotos/' + filename + '.jpg', 'wb') as f:
    f.write(filecontent)
Using your data as x, notice the JFIF in the output from unhexlify:
In [88]: binascii.unhexlify(x[:-1])
Out[88]: '\xff\xd8\xff\xe0\x00\x10JFIF...'
So it appears the data is a JPEG that needs to be unhexlified. Therefore try:
import binascii

filecontent = self.request.body
# unhexlify turns the hex dump back into raw JPEG bytes, so the file has to be
# opened in binary mode; text mode ('w') would corrupt or reject those bytes.
with open('static/studentphotos/' + filename + '.jpg', 'wb') as f:
    f.write(binascii.unhexlify(filecontent))