用于文件中HTTP消息的gzip解压脚本
我正在使用的一个工具会把每个连接的HTTP数据保存到日志文件里。我想知道有没有现成的脚本,可以把文件中经过gzip压缩的消息体解压出来。
数据看起来是这样的:
GET /something HTTP/1.1
Content-Type: text/plain
User-Agent: Mozilla/5.0
Connection: Keep-Alive
Accept-Encoding: gzip, deflate
Accept-Language: en-US,*
Host: something.somedomain
HTTP/1.1 200 OK
Content-Encoding: gzip
Content-Type: text/xml;charset=UTF-8
Date: Wed, 28 May 2014 20:33:14 GMT
Server: something
Content-Length: 160
Connection: keep-alive
<GZIP SECTION ...
FOLLOWING MORE REQUESTS/RESPONSES
我本来想手动处理这些数据,但那样会花太多时间。然后我想如果能写个脚本就好了,但因为我对bash/python/perl这些不太熟悉,所以希望有人已经写好了这样的脚本。
谢谢大家的建议。
1 个回答
1
我自己动手写了一个C++应用程序,完成了我想要的功能,也许将来会有人觉得这个工具有用。这个程序还可以处理分块(chunked)传输编码。程序从标准输入逐行读取文件名,并把解压后的结果写入同名加 ".ugz" 后缀的文件;我的使用方法是 'ls | grep ".log$" | ungzip'。这些日志文件来自SSLSplit。
// ungzip.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
void inflate(std::istream& dataIn, std::ostream& dataOut)
{
boost::iostreams::filtering_streambuf<boost::iostreams::input> in;
in.push(boost::iostreams::gzip_decompressor());
in.push(dataIn);
boost::iostreams::copy(in, dataOut);
}
// Read-only stream buffer laid over a caller-supplied character range
// [begin, end). Nothing is copied, so the range has to stay valid for as long
// as the buffer is being read from.
struct membuf : std::streambuf
{
    membuf(char* begin, char* end)
    {
        // Get area: first byte, current read position, one past the last byte.
        setg(begin, begin, end);
    }
};
int _tmain(int argc, _TCHAR* argv[])
{
boost::iostreams::mapped_file fileIn;
std::ofstream fileOut;
// For each filename on stdin
for (std::string fileName; std::getline(std::cin, fileName);)
{
// Try opening memory mapping of that file.
try
{
fileIn.open(fileName);
if (fileIn.is_open() == false)
{
std::cout << "Error 1" << std::endl;
continue;
}
}
catch (std::exception e)
{
std::cout << e.what();
continue;
}
// Open file to write inflated output to
std::string strOut = fileName;
strOut += ".ugz";
fileOut.open(strOut, std::ios::binary);
if (fileOut.is_open() == false)
{
std::cout << "Error 2" << std::endl;
fileIn.close();
continue;
}
// Load whole file into string to verify if it atleast has HTTP/1.1 somewhere in it.
//Doesnt mean its not binary, but better than nothing.
char * pchData = fileIn.data();
std::string strWhole(pchData, pchData + fileIn.size());
std::regex reg("HTTP/1.1 ");
std::smatch match;
std::stringstream ss(strWhole);
// Interesting header information
enum {REGXCNT = 3};
std::regex regs[REGXCNT] = { std::regex("Content-Length: (\\d+)"), std::regex("Content-Encoding: gzip"), std::regex("Transfer-Encoding: chunked") };
// Verify
if (std::regex_search(strWhole, match, reg))
{
int len = 0;
bool bGzipped = false;
bool bChunked = false;
// While there is something to read
while (!ss.eof())
{
std::string strLine;
std::getline(ss, strLine);
// Empty line between Header and Body
if (strLine == "\r")
{
// Print out the empty line \r\n
fileOut << strLine << std::endl;
// If its gzipped or chunked treat it differently
if (bGzipped || bChunked)
{
// GZipped but not chunked
if (bGzipped && !bChunked)
{
// Construct helper structures inflate and write out
char * pbyBinaryData = new char[len];
ss.read(pbyBinaryData, len);
std::stringbuf stringBuf;
membuf gzipdata(pbyBinaryData, pbyBinaryData + len);
std::istream _in(&gzipdata);
std::ostream _out(&stringBuf);
inflate(_in, _out);
std::stringstream ssOut;
ssOut << _out.rdbuf();
std::string strDataOut = ssOut.str();
fileOut.write(strDataOut.c_str(), strDataOut.length());
delete [] pbyBinaryData;
}
// Chunked data goes here
else if (bChunked)
{
// This vector is used for gzipped data
std::vector<char> unchunkedData;
// Load all chunks
while (true)
{
std::getline(ss, strLine);
// Strip \r from it. It should be always at the end, but whatever - performance is not the issue
strLine.erase(std::remove(strLine.begin(), strLine.end(), '\r'), strLine.end());
// Load chunksize
int nChunkSize = std::stoi(strLine, 0, 16);
if (nChunkSize != 0)
{
// Each chunk is ended \r\n -> +2
char * tmpBuf = new char[nChunkSize + 2];
// Read actual data
ss.read(tmpBuf, nChunkSize + 2);
if (!bGzipped)
{
//Data not gzipped. Write them out directly
fileOut.write(tmpBuf, nChunkSize);
}
else
{
//Data gzipped. Add them to vector to decompress later
unchunkedData.insert(unchunkedData.end(), tmpBuf, tmpBuf + nChunkSize);
}
delete[] tmpBuf;
}
else
{
// All chunks loaded. Break the while loop.
break;
}
}
// Data was gzipped. Time to decompress
if (bGzipped)
{
std::stringbuf stringBuf;
membuf gzipdata(unchunkedData.data(), unchunkedData.data()+unchunkedData.size());
std::istream _in(&gzipdata);
std::ostream _out(&stringBuf);
inflate(_in, _out);
std::stringstream ssOut;
ssOut << _out.rdbuf();
std::string strDataOut = ssOut.str();
fileOut.write(strDataOut.c_str(), strDataOut.length());
}
}
}
// Reset flags
bChunked = false;
len = 0;
bGzipped = false;
}
// Otherwise just save it and try to find a key header info in it
else
{
fileOut << strLine << std::endl;
for (int i = 0; i < REGXCNT; ++i)
{
if (std::regex_search(strLine, match, regs[i]))
{
switch (i)
{
case 0:
len = std::stoi(match[1]);
break;
case 1:
bGzipped = true;
break;
case 2:
bChunked = true;
break;
}
break;
}
}
}
}
}
fileOut.flush();
fileIn.close();
fileOut.close();
}
return 0;
}
头文件 stdafx.h:
#pragma once
#pragma warning (disable: 4244)
#include <tchar.h>
#include <iostream>
#include <boost/tokenizer.hpp>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <fstream>
#include <regex>
#include <vector>
#include <streambuf>
#include <sstream>