使用Javascript'；s atob解码base64不'；t正确解码utf-8字符串_Javascript_Encoding_Utf 8

使用Javascript'；s atob解码base64不'；t正确解码utf-8字符串

javascript encoding utf-8

使用Javascript'；s atob解码base64不'；t正确解码utf-8字符串,javascript,encoding,utf-8,Javascript,Encoding,Utf 8,我正在使用Javascriptwindow.atob（）函数来解码base64编码的字符串（特别是来自GitHub API的base64编码内容）。问题是我要取回ASCII编码字符（比如而不是™）。如何正确处理传入的base64编码流，以便将其解码为utf-8？Unicode问题尽管JavaScript（ECMAScript）已经成熟，但Base64、ASCII和Unicode编码的脆弱性已经引起了很多人的头疼（大部分都在这个问题的历史中）考虑以下示例： const ok = "a

我正在使用Javascript

window.atob（）

函数来解码base64编码的字符串（特别是来自GitHub API的base64编码内容）。问题是我要取回ASCII编码字符（比如

而不是™）。如何正确处理传入的base64编码流，以便将其解码为utf-8？Unicode问题
尽管JavaScript（ECMAScript）已经成熟，但Base64、ASCII和Unicode编码的脆弱性已经引起了很多人的头疼（大部分都在这个问题的历史中）
考虑以下示例：
const ok = "a";
console.log(ok.codePointAt(0).toString(16)); //   61: occupies < 1 byte

const notOK = "✓"
console.log(notOK.codePointAt(0).toString(16)); // 2713: occupies > 1 byte

console.log(btoa(ok));    // YQ==
console.log(btoa(notOK)); // error

二进制解码⇢ UTF-8
解码base64⇢ UTF8
（为什么我们需要这样做？（'00'+c.charCodeAt（0）.toString（16））.slice（-2）
将一个0前置到单个字符串，例如当c==\n
时，c.charCodeAt（0）.toString（16）
返回a
，强制a
表示为0a
）

类型脚本支持
这是一个相同的解决方案，具有一些额外的TypeScript兼容性（通过@MA Maddin）：

第一个解决方案（已弃用）
这使用了escape
和unescape
（现在已被弃用，尽管这仍然适用于所有现代浏览器）：

最后一件事：我第一次遇到这个问题是在调用GitHub API时。为了让它在（移动）Safari上正常工作，我实际上必须在解码base64源代码之前去掉所有空白。无论这在2021年是否仍然相关，我不知道：
function b64_to_utf8( str ) {
    str = str.replace(/\s/g, '');    
    return decodeURIComponent(escape(window.atob( str )));
}

不推荐使用小修正、取消扫描和转义，因此：
function utf8_to_b64( str ) {
    return window.btoa(decodeURIComponent(encodeURIComponent(str)));
}

function b64_to_utf8( str ) {
     return decodeURIComponent(encodeURIComponent(window.atob(str)));
}


function b64_to_utf8( str ) {
    str = str.replace(/\s/g, '');    
    return decodeURIComponent(encodeURIComponent(window.atob(str)));
}

事情变了。这些方法已被弃用
可以在对字符串进行Base64编码之前对其进行URI编码。请注意，这不会生成Base64编码的UTF8，而是生成Base64编码的URL编码数据。双方必须就相同的编码达成一致
请参见此处的工作示例：
对于OP的问题，第三方库（如）应该可以解决该问题。
以下是一些针对可能缺少escape/unescape（）
的浏览器的经得起未来考验的代码。请注意，IE 9和更早版本不支持atob/btoa（）
，因此需要为它们使用自定义的base64函数
// Polyfill for escape/unescape
if( !window.unescape ){
    window.unescape = function( s ){
        return s.replace( /%([0-9A-F]{2})/g, function( m, p ) {
            return String.fromCharCode( '0x' + p );
        } );
    };
}
if( !window.escape ){
    window.escape = function( s ){
        var chr, hex, i = 0, l = s.length, out = '';
        for( ; i < l; i ++ ){
            chr = s.charAt( i );
            if( chr.search( /[A-Za-z0-9\@\*\_\+\-\.\/]/ ) > -1 ){
                out += chr; continue; }
            hex = s.charCodeAt( i ).toString( 16 );
            out += '%' + ( hex.length % 2 != 0 ? '0' : '' ) + hex;
        }
        return out;
    };
}

// Base64 encoding of UTF-8 strings
var utf8ToB64 = function( s ){
    return btoa( unescape( encodeURIComponent( s ) ) );
};
var b64ToUtf8 = function( s ){
    return decodeURIComponent( escape( atob( s ) ) );
};

//用于escape/unescape的Polyfill
如果（！window.unescape）{
window.unescape=函数{
返回s.replace（/%（[0-9A-F]{2}）/g，函数（m，p）{
返回字符串.fromCharCode（'0x'+p）；
} );
};
}
如果（！window.escape）{
window.escape=函数{
变量chr，hex，i=0，l=s.length，out=''；
对于（；i-1）{
out+=chr；continue；}
十六进制=s.charCodeAt（i）.toString（16）；
out+='%'+（十六进制长度%2！=0？'0'：''）+hex；
}
返回；
};
}
//UTF-8字符串的Base64编码
var utf8ToB64=函数{
返回btoa（unescape（一个或多个组件））；
};
var b64ToUtf8=函数{
返回组件（转义（atob））；
};

这里可以找到一个更全面的UTF-8编码和解码示例：
如果您更喜欢将字符串视为字节，那么可以使用以下函数
function u_atob(ascii) {
    return Uint8Array.from(atob(ascii), c => c.charCodeAt(0));
}

function u_btoa(buffer) {
    var binary = [];
    var bytes = new Uint8Array(buffer);
    for (var i = 0, il = bytes.byteLength; i < il; i++) {
        binary.push(String.fromCharCode(bytes[i]));
    }
    return btoa(binary.join(''));
}


// example, it works also with astral plane characters such as 'including above solution if still facing issue try as below, Considerign the case where escape is not supported for TS.

blob = new Blob(["\ufeff", csv_content]); // this will make symbols to appears in excel 

函数u_atob（ascii）{
返回Uint8Array.from（atob（ascii），c=>c.charCodeAt（0））；
}
功能u_btoa（缓冲区）{
var二进制=[]；
var字节=新的Uint8Array（缓冲区）；
for（var i=0，il=bytes.bytellength；i//例如，它也适用于星体平面字符，如“，包括上述解决方案，如果仍然面临问题，请尝试如下，考虑到TS不支持转义的情况
function b64DecodeUnicode(str: any) {        
        return decodeURIComponent(atob(str).split('').map((c: any) => {
            return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
        }).join(''));
    }

对于csv_内容，您可以像下面这样尝试
function b64EncodeUnicode(str) {
    // first we use encodeURIComponent to get percent-encoded UTF-8,
    // then we convert the percent encodings into raw bytes which
    // can be fed into btoa.
    return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g,
        function toSolidBytes(match, p1) {
            return String.fromCharCode('0x' + p1);
    }));
}

b64EncodeUnicode('✓ à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n'); // "Cg=="

以下是2018年更新的解决方案，如
从UNICODE编码到B64
function b64DecodeUnicode(str) {
    // Going backwards: from bytestream, to percent-encoding, to original string.
    return decodeURIComponent(atob(str).split('').map(function(c) {
        return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
    }).join(''));
}

b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "✓ à la mode"
b64DecodeUnicode('Cg=='); // "\n"

从B64解码为UNICODE
!function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?inputString:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return p(b).replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global)

我假设您可能需要一个能够产生广泛使用的base64 URI的解决方案。请访问数据：文本/纯文本；字符集=utf-8；base64,4pi44pi54pi64pi74pi84pi+4pi/
查看演示（复制数据uri，打开新选项卡，将数据uri粘贴到地址栏，然后按enter键进入页面）。尽管这个URI是base64编码的，浏览器仍然能够识别高代码点并正确解码。小型编码器+解码器为1058字节（+Gzip）→（589字节）

您好\u1234 W\186\0256ld！
在顶部框中输入文本。然后URL将自动更新。


适合我的完整文章：
我们从Unicode/UTF-8编码的部分是
function b64DecodeUnicode(str) {
    // Going backwards: from bytestream, to percent-encoding, to original string.
    return decodeURIComponent(atob(str).split('').map(function(c) {
        return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
    }).join(''));
}

这是当今最常用的方法之一。
将base64解码为UTF8字符串
以下是@brandonscript最新投票结果
function decodeBase64(base64) {
    const text = atob(base64);
    const length = text.length;
    const bytes = new Uint8Array(length);
    for (let i = 0; i < length; i++) {
        bytes[i] = text.charCodeAt(i);
    }
    const decoder = new TextDecoder(); // default is utf-8
    return decoder.decode(bytes);
}

上面的代码可以工作，但速度非常慢。如果您的输入是非常大的base64字符串，例如，对于base64 html文档为30000个字符。这将需要大量的计算
这是我的答案，使用内置的文本解码器，对于大输入，比上面的代码快近10倍
函数decodeBase64（base64）{
常量文本=atob（base64）；
常量长度=text.length；
常量字节=新的Uint8Array（长度）；
for（设i=0；i
您链接的MDN页面有一个以短语“用于Unicode或UTF-8字符串”开头的段落。您在节点上吗？有比atob
更好的解决方案，看起来doc链接甚至与现在不同，建议使用正则表达式解决方案来管理它。这不起作用，因为encodeURIComponent
与decodeuricomonen相反
function u_atob(ascii) {
    return Uint8Array.from(atob(ascii), c => c.charCodeAt(0));
}

function u_btoa(buffer) {
    var binary = [];
    var bytes = new Uint8Array(buffer);
    for (var i = 0, il = bytes.byteLength; i < il; i++) {
        binary.push(String.fromCharCode(bytes[i]));
    }
    return btoa(binary.join(''));
}


// example, it works also with astral plane characters such as 'including above solution if still facing issue try as below, Considerign the case where escape is not supported for TS.

blob = new Blob(["\ufeff", csv_content]); // this will make symbols to appears in excel 

function b64DecodeUnicode(str: any) {        
        return decodeURIComponent(atob(str).split('').map((c: any) => {
            return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
        }).join(''));
    }

function b64EncodeUnicode(str) {
    // first we use encodeURIComponent to get percent-encoded UTF-8,
    // then we convert the percent encodings into raw bytes which
    // can be fed into btoa.
    return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g,
        function toSolidBytes(match, p1) {
            return String.fromCharCode('0x' + p1);
    }));
}

b64EncodeUnicode('✓ à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n'); // "Cg=="

function b64DecodeUnicode(str) {
    // Going backwards: from bytestream, to percent-encoding, to original string.
    return decodeURIComponent(atob(str).split('').map(function(c) {
        return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
    }).join(''));
}

b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "✓ à la mode"
b64DecodeUnicode('Cg=='); // "\n"

!function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?inputString:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return p(b).replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global)

var fromCharCode = String.fromCharCode;
var btoaUTF8 = (function(btoa, replacer){"use strict";
    return function(inputString, BOMit){
        return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
            /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
        ));
    }
})(btoa, function(nonAsciiChars){"use strict";
    // make the UTF string into a binary UTF-8 encoded string
    var point = nonAsciiChars.charCodeAt(0);
    if (point >= 0xD800 && point <= 0xDBFF) {
        var nextcode = nonAsciiChars.charCodeAt(1);
        if (nextcode !== nextcode) // NaN because string is 1 code point long
            return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
        // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
        if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {
            point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
            if (point > 0xffff)
                return fromCharCode(
                    (0x1e/*0b11110*/<<3) | (point>>>18),
                    (0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
                    (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
                    (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
                );
        } else return fromCharCode(0xef, 0xbf, 0xbd);
    }
    if (point <= 0x007f) return nonAsciiChars;
    else if (point <= 0x07ff) {
        return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f));
    } else return fromCharCode(
        (0xe/*0b1110*/<<4) | (point>>>12),
        (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
        (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
    );
});

var clz32 = Math.clz32 || (function(log, LN2){"use strict";
    return function(x) {return 31 - log(x >>> 0) / LN2 | 0};
})(Math.log, Math.LN2);
var fromCharCode = String.fromCharCode;
var atobUTF8 = (function(atob, replacer){"use strict";
    return function(inputString, keepBOM){
        inputString = atob(inputString);
        if (!keepBOM && inputString.substring(0,3) === "\xEF\xBB\xBF")
            inputString = inputString.substring(3); // eradicate UTF-8 BOM
        // 0xc0 => 0b11000000; 0xff => 0b11111111; 0xc0-0xff => 0b11xxxxxx
        // 0x80 => 0b10000000; 0xbf => 0b10111111; 0x80-0xbf => 0b10xxxxxx
        return inputString.replace(/[\xc0-\xff][\x80-\xbf]*/g, replacer);
    }
})(atob, function(encoded){"use strict";
    var codePoint = encoded.charCodeAt(0) << 24;
    var leadingOnes = clz32(~codePoint);
    var endPos = 0, stringLen = encoded.length;
    var result = "";
    if (leadingOnes < 5 && stringLen >= leadingOnes) {
        codePoint = (codePoint<<leadingOnes)>>>(24+leadingOnes);
        for (endPos = 1; endPos < leadingOnes; ++endPos)
            codePoint = (codePoint<<6) | (encoded.charCodeAt(endPos)&0x3f/*0b00111111*/);
        if (codePoint <= 0xFFFF) { // BMP code point
          result += fromCharCode(codePoint);
        } else if (codePoint <= 0x10FFFF) {
          // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
          codePoint -= 0x10000;
          result += fromCharCode(
            (codePoint >> 10) + 0xD800,  // highSurrogate
            (codePoint & 0x3ff) + 0xDC00 // lowSurrogate
          );
        } else endPos = 0; // to fill it in with INVALIDs
    }
    for (; endPos < stringLen; ++endPos) result += "\ufffd"; // replacement character
    return result;
});

function utf8_to_b64( str ) {
   return window.btoa(unescape(encodeURIComponent( str )));
}

function b64_to_utf8( str ) {
   return decodeURIComponent(escape(window.atob( str )));
}

// Usage:
utf8_to_b64('✓ à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64_to_utf8('4pyTIMOgIGxhIG1vZGU='); // "✓ à la mode"

function b64DecodeUnicode(str) {
    // Going backwards: from bytestream, to percent-encoding, to original string.
    return decodeURIComponent(atob(str).split('').map(function(c) {
        return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
    }).join(''));
}

function decodeBase64(base64) {
    const text = atob(base64);
    const length = text.length;
    const bytes = new Uint8Array(length);
    for (let i = 0; i < length; i++) {
        bytes[i] = text.charCodeAt(i);
    }
    const decoder = new TextDecoder(); // default is utf-8
    return decoder.decode(bytes);
}