記一次ECONNRESET的問題排查

現象

最近發現雲構建任務在做實時日誌上傳OSS時會報ECONNRESET錯誤碼,並且只在一些特定的任務中出現。一個構建任務是由多個step組成的,執行完一個再執行下一個,直至結束。當某個step執行耗時比較長的時候,就會出現上述錯誤。

排查

為了復現這個問題,我寫了一段測試代碼,如下:

use strict;const begin = Date.now();function createStream() { const stream = new Readable(); stream._read = () => {}; return stream;}const store = oss({ accessKeyId: xxxx, accessKeySecret: xxx, bucket: cbu-assets, endpoint: oss-abc.aliyuncs.com,});co(function*() { const stream = createStream(); stream.push(foo); // 模擬某個耗時step new Promise(resolve => { setTimeout(() => { stream.push(bar); stream.push(null); }, 120 * 1000); }); const { res } = yield store.putStream(osskey, stream); console.log(res);}).catch(e => { console.log(`cost: ${Date.now() - begin}ms`); console.error(e.code); console.error(e.stack);});

每個step都會輸出日誌,為了提高處理效率,這裡利用stream的特性,日誌一旦產出就執行上傳。為了模擬耗時的step,我設置了120秒的間隔,表示這個step的執行時間。然後,我得到了如下報錯:

cost: 30019msECONNRESETResponseError: socket hang up (req "error"), PUT http://cbu-assets.oss-abc.aliyuncs.com/aa -1 (connected: true, keepalive socket: false, agent status: {"createSocketCount":1,"closeSocketCount":1,"errorSocketCount":0,"timeoutSocketCount":1,"requestCount":0,"freeSockets":{},"sockets":{},"requests":{}})headers: {} at createHangUpError (_http_client.js:331:15) at Socket.socketCloseListener (_http_client.js:363:23) at emitOne (events.js:121:20) at Socket.emit (events.js:211:7) at TCP._handle.close [as _onclose] (net.js:554:12)

當時的第一感是OSS的問題,因為ECONNRESET這種錯誤一般是服務端主動斷開連接,之前遇到過好幾次,過一會兒再調就好了。但是這次的報錯是有規律可尋的,心裡上無法說服自己,於是打算排查一下。根據錯誤堆棧,可以最後執行到createHangUpError函數里,先看一下它的源碼:

// lib/_http_client.js#Ln331function createHangUpError() { var error = new Error(socket hang up); error.code = ECONNRESET; return error;}

原來ECONNRESET是在這裡設置到error對象上的。接著,再看下調用它的socketCloseListener

function ClientRequest(options, cb) { // 省略... if (this.agent) { // If there is an agent we should default to Connection:keep-alive, // but only if the Agent will actually reuse the connection! // If its not a keepAlive agent, and the maxSockets==Infinity, then // theres never a case where this socket will actually be reused if (!this.agent.keepAlive && !Number.isFinite(this.agent.maxSockets)) { this._last = true; this.shouldKeepAlive = false; } else { this._last = false; this.shouldKeepAlive = true; } this.agent.addRequest(this, options); } // 省略...}ClientRequest.prototype.onSocket = function onSocket(socket) { process.nextTick(onSocketNT, this, socket);};function onSocketNT(req, socket) { if (req.aborted) { // If we were aborted while waiting for a socket, skip the whole thing. if (!req.agent) { socket.destroy(); } else { socket.emit(free); } } else { tickOnSocket(req, socket); }}function tickOnSocket(req, socket) { // 省略... socket.on(close, socketCloseListener);}// lib/_http_client.js#Ln337function socketCloseListener() { // 省略... if (req.res && req.res.readable) { // 省略... } else if (!req.res && !req.socket._hadError) { // This socket error fired before we started to // receive a response. The error needs to // fire on the request. req.emit(error, createHangUpError()); req.socket._hadError = true; } // 省略...}

為了說明這個函數的作用,需要有一些上下文。首先是ClientRequest這個類,它是nodejs內部對http請求的封裝,客戶端每發起一個請求,就會new一個ClientRequest實例。它有一個agent欄位,是http.Agent實例,在nodejs里,對http連接的管理就是通過http.Agent來實現的。多說一句,http同樣支持==長連接==,因為它的底層是基於TCP/IP協議傳輸的。在ClientRequest的構造函數里,調用了this.agent.addRequest方法,這表示每次請求都會從連接池==獲取==一個連接(socket)。

socketCloseListener函數是socketclose事件處理函數,從代碼可以看到socket在沒有收到response的情況下觸發了close事件,因此可以判斷是==客戶端主動斷開連接導致的==。另外,從列印的信息可知,從發起請求到收到報錯經歷了30019ms,也就是30秒左右。帶著這些頭緒,接著看客戶端是如何觸發斷開連接的邏輯的。

先來看ali-oss是如何初始化實例的:

var AgentKeepalive = require(agentkeepalive);var globalHttpAgent = new AgentKeepalive();function Client(options, ctx) { if (!(this instanceof Client)) { return new Client(options, ctx); } if (options && options.inited) { this.options = options; } else { this.options = Client.initOptions(options); } // support custom agent and urllib client if (this.options.urllib) { this.urllib = this.options.urllib; } else { this.urllib = urllib; this.agent = this.options.agent || globalHttpAgent; } this.ctx = ctx; this.userAgent = this._getUserAgent();}

在初始化OSS實例的時候,如果沒有手動傳入agent,默認用的是globalHttpAgent,它是一個KeepAliveAgent實例。那麼KeepAliveAgent是啥?和http.Agent又是啥關係呢?為了解開這個問題,繼續跟進agentkeepalive的源碼:

const OriginalAgent = require(./_http_agent).Agent;class Agent extends OriginalAgent { constructor(options) { // 省略... // Sets the socket to timeout after timeout milliseconds of inactivity on the socket. // By default is double free socket keepalive timeout. if (options.timeout === undefined) { options.timeout = options.freeSocketKeepAliveTimeout * 2; // make sure socket default inactivity timeout >= 30s if (options.timeout < 30000) { options.timeout = 30000; } } options.timeout = ms(options.timeout); super(options); // 省略... }}

看到這裡基本清楚了,KeepAliveAgent繼承了http.Agent,它針對長連接做了一些配置。可以看到,這裡的options.timeout如果沒有外部傳入的話,會被默認設置成為30秒。接著來看下父類的實現:

function Agent(options) { EventEmitter.call(this); var self = this; // 省略... // working socket timeout. By default working socket do not have a timeout. self.timeout = self.options.timeout || 0; // 省略...}Agent.prototype.addRequest = function addRequest(req, options) { // 省略... var freeLen = this.freeSockets[name] ? this.freeSockets[name].length : 0; var sockLen = freeLen + this.sockets[name].length; if (freeLen) { // we have a free socket, so use that. var socket = this.freeSockets[name].shift(); debug(have free socket); // [patch start] // remove free socket error event handler socket.removeListener(error, freeSocketErrorListener); // restart the default timer socket.setTimeout(this.timeout); if (this.socketActiveTTL && Date.now() - socket.createdTime > this.socketActiveTTL) { debug(`socket ${socket.createdTime} expired`); socket.destroy(); return this.createSocket(req, options, handleSocketCreation(req)); } // [patch end] // dont leak if (!this.freeSockets[name].length) delete this.freeSockets[name]; socket.ref(); req.onSocket(socket); this.sockets[name].push(socket); } else if (sockLen < this.maxSockets) { debug(call onSocket, sockLen, freeLen); // If we are under maxSockets create a new one. // [patch start] this.createSocket(req, options, handleSocketCreation(req)); // [patch end] }}Agent.prototype.createSocket = function createSocket(req, options, cb) { // 省略... const newSocket = self.createConnection(options, oncreate); function oncreate(err, s) { // 省略... function onTimeout() { debug(CLIENT socket onTimeout); s.destroy(); // Remove it from freeSockets immediately to prevent new requests from being sent through this socket. self.removeSocket(s, options); self.emit(timeout); } s.on(timeout, onTimeout); s.setTimeout(self.timeout); // 省略... }}

原來timeout被用於傳入socket.setTimeout。至此,真相大白。。timeout表示的是socket的最大允許空閑時間(idle time),超過這個時間就會被客戶端回收(onTimeout函數里調用了socket.destroy()),也就是連接被斷開。而這個最大允許空閑時間和我們列印出來的請求耗時是吻合的,即30秒

OK,明確了問題所在,對測試代碼進行修改:

use strict;const begin = Date.now();const AgentKeepAlive = require(agentkeepalive);function createStream() { const stream = new Readable(); stream._read = () => {}; return stream;}const store = oss({ accessKeyId: xxxx, accessKeySecret: xxx, bucket: cbu-assets, endpoint: oss-cn-hangzhou-zmf.aliyuncs.com, // 手動傳入agent,並把最大空閑時間設置為30分鐘 agent: new AgentKeepAlive({ timeout: 30m, }),});co(function*() { const stream = createStream(); stream.push(foo); // 模擬某個耗時step new Promise(resolve => { setTimeout(() => { stream.push(bar); stream.push(null); }, 120 * 1000); }); const { res } = yield store.putStream(osskey, stream); console.log(res);}).catch(e => { console.log(`cost: ${Date.now() - begin}ms`); console.error(e.code); console.error(e.stack);});

這次在構造OSS實例的時候手動傳入agent實例,並把socket最大允許空閑時間設置得足夠大(30分鐘),再次執行代碼,又得到如下報錯:

cost: 60020msundefinedResponseTimeoutError: Response timeout for 60000ms, PUT http://cbu-assets.oss-abc.aliyuncs.com/aa -1 (connected: true, keepalive socket: false, agent status: {"createSocketCount":1,"createSocketErrorCount":0,"closeSocketCount":1,"errorSocketCount":0,"timeoutSocketCount":0,"requestCount":0,"freeSockets":{},"sockets":{},"requests":{}})headers: {} at Timeout._onTimeout (/Users/xuezu/test/oss/node_modules/_urllib@2.25.1@urllib/lib/urllib.js:749:15) at ontimeout (timers.js:475:11) at tryOnTimeout (timers.js:310:5) at Timer.listOnTimeout (timers.js:270:5)

這次錯誤名變成了ResponseTimeoutError,這個錯誤很常見,==是等待響應超時了==。列印的整個請求過程時間是60秒,而urllib的默認響應超時時間是5秒,怎麼過了這麼久才報出來?還是查一下在哪裡設置過了:

Client.initOptions = function initOptions(options) { // 省略... if (!options || !options.accessKeyId || !options.accessKeySecret) { throw new Error(require accessKeyId, accessKeySecret); } var opts = { region: oss-cn-hangzhou, internal: false, secure: false, timeout: 60000, // 60s bucket: null, endpoint: null, cname: false, }; // 省略...};

原來OSS實例初始化的時候重新設置過。OK,繼續改代碼:

use strict;const begin = Date.now();const AgentKeepAlive = require(agentkeepalive);function createStream() { const stream = new Readable(); stream._read = () => {}; return stream;}const store = oss({ accessKeyId: xxxx, accessKeySecret: xxx, bucket: cbu-assets, endpoint: oss-cn-hangzhou-zmf.aliyuncs.com, agent: new AgentKeepAlive({ timeout: 30m, }), timeout: 30m, // 將響應超時時間設置為30分鐘});co(function*() { const stream = createStream(); stream.push(foo); // 模擬某個耗時step new Promise(resolve => { setTimeout(() => { stream.push(bar); stream.push(null); }, 120 * 1000); }); const { res } = yield store.putStream(osskey, stream); console.log(res);}).catch(e => { console.log(`cost: ${Date.now() - begin}ms`); console.error(e.code); console.error(e.stack);});

加了timeout配置項,並設置得足夠大(30分鐘)。ready?Go!阿哦,還是報錯:

cost: 60129msECONNRESETResponseError: socket hang up (req "error"), PUT http://cbu-assets.oss-abc.aliyuncs.com/aa -1 (connected: true, keepalive socket: false, agent status: {"createSocketCount":1,"createSocketErrorCount":0,"closeSocketCount":0,"errorSocketCount":0,"timeoutSocketCount":0,"requestCount":0,"freeSockets":{},"sockets":{"cbu-assets.oss-abc.aliyuncs.com:80:":1},"requests":{}})headers: {} at createHangUpError (_http_client.js:331:15) at Socket.socketOnEnd (_http_client.js:423:23) at emitNone (events.js:111:20) at Socket.emit (events.js:208:7) at endReadableNT (_stream_readable.js:1056:12) at _combinedTickCallback (internal/process/next_tick.js:138:11) at process._tickCallback (internal/process/next_tick.js:180:9)

咦?這次錯誤跟第一次遇到的好像!錯誤碼也是ECONNRESET!但是仔細一看,還是有些許區別,一個是請求時間由之前的30秒變成了60秒,另一個區別是錯誤堆棧不大一樣,堆棧的第二行由原來的socketCloseListener變成socketOnEnd,可見是不同的行為導致的。只能再看看源碼了:

function tickOnSocket(req, socket) { // 省略... socket.on(end, socketOnEnd); socket.on(close, socketCloseListener);}function socketOnEnd() { var socket = this; var req = this._httpMessage; var parser = this.parser; if (!req.res && !req.socket._hadError) { // If we dont have a response then we know that the socket // ended prematurely and we need to emit an error on the request. req.emit(error, createHangUpError()); req.socket._hadError = true; } if (parser) { parser.finish(); freeParser(parser, req, socket); } socket.destroy();}

原來,邏輯走到了socketend事件處理函數里。查閱下API文檔,看看socketend事件代表啥意思:

Emitted when the other end of the socket sends a FIN packet, thus ending the readable side of the socket.

意思是說,當客戶端接收到遠端伺服器發送的FIN包後,會觸發end事件。也就是說,==連接是被遠程伺服器主動斷開的,而客戶端是被動關閉連接。==結合列印的請求時間60秒來看,OSS伺服器的socket最大允許空閑時間應該是60秒。至此,真相大白了(這次是真的)!

解決

那這個case咋解決呢?答案是==增加心跳包==。既然OSS服務端最大允許空閑60秒,那就需要客戶端在60秒之內至少要發一個包過去,以維持連接。我們可以對ReadAble Stream稍微改造下(僅供參考):

use strict;const { Readable } = require(stream);const { OSS_HEARTBEAT_PACKET } = require(./constants);class OSSReadStream extends Readable { constructor(options) { super(options); this._lastPushTime = null; this._heartbeatInterval = 10000; // 每隔10秒輪詢檢測 this._heartbeatTimer = this._setUpHeartbeatTimer(); this.on(end, () => { // clear timer clearInterval(this._heartbeatTimer); this._heartbeatTimer = null; }); } push(...args) { this._lastPushTime = Date.now(); return super.push(...args); } _setUpHeartbeatTimer() { if (this._heartbeatTimer) { return; } return setInterval(() => { const lastPushTime = this._lastPushTime; const now = Date.now(); // 如果距離上一次push超過50s,則push一個心跳包 if (lastPushTime && now - lastPushTime > 50000) { this.push(OSS_HEARTBEAT_PACKET); } }, this._heartbeatInterval); }}module.exports = OSSReadStream;

這裡的OSS_HEARTBEAT_PACKET就是我們要發送心跳包的內容,因為我的需求是上傳日誌,所以這裡設置成一個不可見字元(經過測試,如果設置成零長度的Buffer不會起作用,感謝 @Code Hz 同學提出質疑,以免影響展示邏輯。這個僅僅是個例子,用來說明一下問題。再改下測試代碼:

use strict;const begin = Date.now();const AgentKeepAlive = require(agentkeepalive);const OssReadableStream = require(./oss_stream);function createStream() { const stream = new OssReadableStream(); // 換成改造過的stream,增加心跳包上傳功能 stream._read = () => {}; return stream;}const store = oss({ accessKeyId: xxxx, accessKeySecret: xxx, bucket: cbu-assets, endpoint: oss-cn-hangzhou-zmf.aliyuncs.com, agent: new AgentKeepAlive({ timeout: 30m, }), timeout: 30m, // 將響應超時時間設置為30分鐘});co(function*() { const stream = createStream(); stream.push(foo); // 模擬某個耗時step new Promise(resolve => { setTimeout(() => { stream.push(bar); stream.push(null); }, 120 * 1000); }); const { res } = yield store.putStream(osskey, stream); console.log(res);}).catch(e => { console.log(`cost: ${Date.now() - begin}ms`); console.error(e.code); console.error(e.stack);});

再運行一下,就能正常上傳了。至此,問題算是解決了。

最後

如果你對全棧技術感興趣,歡迎一起探討,互相學習和進步。勾搭方式,Wechat: 113338655 || Email: ruoqi.drq@alibaba-inc.com


推薦閱讀:

使用nodejs開發的網站有哪些?
如何實現一個基於 DOM 的模板引擎
前端日刊-2018.01.21
初級web前端工程師面試必看(HTML+CSS)
[譯]手把手教你創建國際象棋 AI

TAG:前端開發 | Nodejs |