Node.js: Handling Uncertain HTTP Response Compression

When writing code to fetch content over HTTP, you usually know in advance whether the response will be delivered compressed with gzip or deflate. For one thing, you often control the web server in question; for another, you can set the Accept-Encoding request header to specify whether the server should compress responses, and which methods it may use.

That said, the world is a strange place, and sometimes you find yourself dealing with an API that may or may not decide to compress the content it is delivering, and may or may not pay any attention to the Accept-Encoding request header. Handling this in Node.js has some potential pitfalls, hence this post. The examples given here use the popular Request package.

Boilerplate Code

We'll start out by defining some handler functions to use in the example code here. The objective when running a request is to obtain a plain text response body and the response headers.

/**
 * Consume a completed response.
 *
 * This is a placeholder: the examples call it once they hold the response
 * headers and the plain-text body.
 *
 * @param {Object} headers
 *   Response headers as a map of header name to value.
 * @param {String} body
 *   The response body, already uncompressed.
 */
function processResponse (headers, body) {
  // Application-specific processing goes here.
}

/**
 * Deal with a failed request.
 *
 * This is a placeholder: the examples call it with whatever error they
 * encounter (transport, HTTP status, or decompression).
 *
 * @param {Error} error
 *   The error that occurred.
 */
function handleError (error) {
  // Application-specific error handling goes here.
}

/**
 * Obtain the encoding for the content given the headers.
 *
 * The charset parameter of the Content-Type header may legally be written as
 * a quoted string (charset="utf-8") and may be padded with whitespace. Both
 * are stripped here so that the returned name does not carry characters that
 * are not part of the charset itself.
 *
 * @param {Object} headers
 *   A hash of response header name-value pairs. A missing hash is treated as
 *   having no Content-Type header.
 * @return {String}
 *   The encoding if specified, or 'utf-8'.
 */
function obtainCharset (headers) {
  var contentType = (headers && headers['content-type']) || '';
  var matches = contentType.match(/charset=([^;,\r\n]+)/i);
  if (!matches) {
    return 'utf-8';
  }

  // Drop surrounding whitespace, then one pair of matching quotes, then any
  // whitespace that was inside the quotes.
  var charset = matches[1]
    .trim()
    .replace(/^(["'])(.*)\1$/, '$2')
    .trim();

  // An empty value (e.g. charset="") counts as unspecified.
  return charset || 'utf-8';
}

Uncompressed Response

If you know that content is uncompressed, then the Request package can be used in a very standard fashion:

var util = require('util');
var request = require('request');

request({
  url: 'http://www.example.com/path/to/api',
  // Ask the server to send the body as-is, with no compression applied.
  headers: { 'Accept-Encoding': 'identity' }
}, function (error, response, body) {
  var failure = error;

  // With no transport error, a 4xx/5xx status is still reported as a
  // failure, quoting the response body in the message.
  if (!failure && response.statusCode >= 400) {
    failure = new Error(util.format(
      'Response with status code %s: %s', response.statusCode, body
    ));
  }

  if (failure) {
    return handleError(failure);
  }

  processResponse(response.headers, body);
});

Compressed Response

Similarly, if the response is known to be compressed with gzip:

var util = require('util');
var zlib = require('zlib');
var request = require('request');

// Raw chunks of the (gzipped) response, in arrival order.
var buffers = [];

var req = request({
  url: 'http://www.example.com/path/to/api',
  headers: { 'Accept-Encoding': 'gzip' }
}, function (error, response) {
  if (error) {
    return handleError(error);
  }

  if (response.statusCode >= 400) {
    var message = util.format(
      'Response with status code %s.', response.statusCode
    );
    return handleError(new Error(message));
  }

  // The body handed to this callback (and response.body) is the compressed
  // data turned into a string, which is unusable. The real content has to be
  // rebuilt from the chunks collected off the stream.
  var compressed = Buffer.concat(buffers);

  zlib.gunzip(compressed, function (gunzipError, bodyBuffer) {
    if (gunzipError) {
      return handleError(gunzipError);
    }

    var text = bodyBuffer.toString(obtainCharset(response.headers));
    processResponse(response.headers, text);
  });
});

// Collect each streamed chunk of the compressed response as it arrives.
req.on('data', function (buf) {
  buffers.push(buf);
});

You can stream the response directly to zlib rather than gathering up buffers from the request stream, but this has pitfalls. Not all servers are going to deliver sufficiently well-formed gzipped content for the "end" event to actually fire in the following code:

var req = request(options);
var stream = req.pipe(zlib.createGunzip());
var chunks = [];
var headers = {};
var body = '';

// The headers are only available from the request's "response" event; the
// gunzip stream knows nothing about them.
req.on('response', function (response) {
  headers = response.headers;
});

// pipe() does not forward errors from the source, so the request and the
// gunzip stream each need their own handler.
req.on('error', handleError);
stream.on('error', handleError);

// For some servers this event will never fire.
stream.on('end', function () {
  // Decode once, over the whole body: converting chunk by chunk would corrupt
  // any multi-byte character that straddles a chunk boundary.
  body = Buffer.concat(chunks).toString(obtainCharset(headers));
  processResponse(headers, body);
});

stream.on('data', function (data) {
  chunks.push(data);
});

A Response That May or May Not Be Compressed

The right way to go about this is to collect the streamed data of the response regardless of whether you need it or not, because you won't find out whether you need it until the request is complete and you can inspect the response headers in the callback. At that point it is too late to go back and try to pipe or otherwise obtain the raw response buffer:

var util = require('util');
var zlib = require('zlib');
var request = require('request');

// Raw chunks of the response exactly as received, in arrival order.
var buffers = [];

var req = request({
  url: 'http://www.example.com/path/to/api',
  headers: {
    'Accept-Encoding': 'deflate,gzip'
  }
}, function (error, response, body) {
  if (error) {
    return handleError(error);
  } else if (response.statusCode >= 400) {
    return handleError(new Error(util.format(
      'Response with status code %s.', response.statusCode
    )));
  }

  var charset = obtainCharset(response.headers);

  // Content-coding names are case-insensitive, so normalize before comparing.
  var contentEncoding = String(response.headers['content-encoding'] || '')
    .trim()
    .toLowerCase();

  // Shared tail for both decompression paths: report the error, or decode the
  // decompressed bytes and hand them on.
  var deliver = function (decompressError, bodyBuffer) {
    if (decompressError) {
      return handleError(decompressError);
    }
    processResponse(response.headers, bodyBuffer.toString(charset));
  };

  if (contentEncoding === 'gzip' || contentEncoding === 'x-gzip') {
    // "x-gzip" is the legacy alias for "gzip".
    zlib.gunzip(Buffer.concat(buffers), deliver);
  } else if (contentEncoding === 'deflate') {
    var compressed = Buffer.concat(buffers);
    zlib.inflate(compressed, function (inflateError, bodyBuffer) {
      if (!inflateError) {
        return deliver(null, bodyBuffer);
      }
      // Some servers send raw deflate data without the zlib wrapper. Retry in
      // raw mode, and if that fails too report the original error.
      zlib.inflateRaw(compressed, function (rawError, rawBuffer) {
        deliver(rawError ? inflateError : null, rawBuffer);
      });
    });
  } else {
    // Not compressed: the body decoded by request is usable as it stands.
    processResponse(response.headers, body);
  }
});

// Gather up the streamed raw response (compressed or not) into an array of
// buffers; whether it is needed is only known once the headers are in.
req.on('data', function (buf) {
  buffers.push(buf);
});