用于文件中HTTP消息的gzip解压脚本

0 投票
1 回答
529 浏览
提问于 2025-04-18 07:52

我正在使用的一个工具会把每个连接的HTTP数据保存到日志文件里。我在想有没有什么脚本可以把文件中压缩过的gzip消息解压出来。

数据看起来是这样的:

GET /something HTTP/1.1
Content-Type: text/plain
User-Agent: Mozilla/5.0
Connection: Keep-Alive
Accept-Encoding: gzip, deflate
Accept-Language: en-US,*
Host: something.somedomain

HTTP/1.1 200 OK
Content-Encoding: gzip
Content-Type: text/xml;charset=UTF-8
Date: Wed, 28 May 2014 20:33:14 GMT
Server: something
Content-Length: 160
Connection: keep-alive

<GZIP SECTION ...

FOLLOWING MORE REQUESTS/RESPONSES

我本来想手动处理这些数据,但那样会花太多时间。然后我想如果能写个脚本就好了,但因为我对bash/python/perl这些不太熟悉,所以希望有人已经写好了这样的脚本。

谢谢大家的建议。

1 个回答

1

我自己动手写了一个C++应用程序,完成了我想要的功能,也许将来会有人用得上。程序从标准输入逐行读取文件名,把每个文件处理后的内容写入同名并加上 .ugz 后缀的新文件,同时支持分块(chunked)传输编码。我的使用方法是 'ls | grep "\.log$" | ungzip'。这些日志文件是由SSLSplit生成的。

// ungzip.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

void inflate(std::istream& dataIn, std::ostream& dataOut)
{
    boost::iostreams::filtering_streambuf<boost::iostreams::input> in;
    in.push(boost::iostreams::gzip_decompressor());
    in.push(dataIn);
    boost::iostreams::copy(in, dataOut);
}

// Stream buffer whose get area is the caller-supplied byte range
// [first, last). The bytes are referenced in place, not copied, so the
// range has to stay valid for as long as the buffer is being read.
struct membuf : std::streambuf
{
    membuf(char* first, char* last)
    {
        // Get area: start = first, current read position = first, end = last.
        std::streambuf::setg(first, first, last);
    }
};

int _tmain(int argc, _TCHAR* argv[])
{
    boost::iostreams::mapped_file fileIn;
    std::ofstream fileOut;

    // For each filename on stdin
    for (std::string fileName; std::getline(std::cin, fileName);)
    {
        // Try opening memory mapping of that file.
        try
        {
            fileIn.open(fileName);
            if (fileIn.is_open() == false)
            {
                std::cout << "Error 1" << std::endl;
                continue;
            }
        }
        catch (std::exception e)
        {
            std::cout << e.what();
            continue;
        }

        // Open file to write inflated output to
        std::string strOut = fileName;
        strOut += ".ugz";
        fileOut.open(strOut, std::ios::binary);
        if (fileOut.is_open() == false)
        {
            std::cout << "Error 2" << std::endl;
            fileIn.close();
            continue;
        }

        // Load whole file into string to verify if it atleast has HTTP/1.1 somewhere in it. 
        //Doesnt mean its not binary, but better than nothing.
        char * pchData = fileIn.data();
        std::string strWhole(pchData, pchData + fileIn.size());
        std::regex reg("HTTP/1.1 ");
        std::smatch match;

        std::stringstream ss(strWhole);

        // Interesting header information
        enum {REGXCNT = 3};
        std::regex regs[REGXCNT] = { std::regex("Content-Length: (\\d+)"), std::regex("Content-Encoding: gzip"), std::regex("Transfer-Encoding: chunked") };

        // Verify
        if (std::regex_search(strWhole, match, reg))
        {
            int len = 0;
            bool bGzipped = false;
            bool bChunked = false;

            // While there is something to read
            while (!ss.eof())
            {
                std::string strLine;
                std::getline(ss, strLine);

                // Empty line between Header and Body
                if (strLine == "\r")
                {
                    // Print out the empty line \r\n
                    fileOut << strLine << std::endl;

                    // If its gzipped or chunked treat it differently
                    if (bGzipped || bChunked)
                    {
                        // GZipped but not chunked
                        if (bGzipped && !bChunked)
                        {
                            // Construct helper structures inflate and write out
                            char * pbyBinaryData = new char[len];
                            ss.read(pbyBinaryData, len);
                            std::stringbuf stringBuf;
                            membuf gzipdata(pbyBinaryData, pbyBinaryData + len);
                            std::istream _in(&gzipdata);
                            std::ostream _out(&stringBuf);
                            inflate(_in, _out);
                            std::stringstream ssOut;
                            ssOut << _out.rdbuf();
                            std::string strDataOut = ssOut.str();
                            fileOut.write(strDataOut.c_str(), strDataOut.length());
                            delete [] pbyBinaryData;
                        }
                        // Chunked data goes here
                        else if (bChunked)
                        {
                            // This vector is used for gzipped data
                            std::vector<char> unchunkedData;

                            // Load all chunks
                            while (true)
                            {
                                std::getline(ss, strLine);
                                // Strip \r from it. It should be always at the end, but whatever - performance is not the issue
                                strLine.erase(std::remove(strLine.begin(), strLine.end(), '\r'), strLine.end());
                                // Load chunksize
                                int nChunkSize = std::stoi(strLine, 0, 16);

                                if (nChunkSize != 0)
                                {
                                    // Each chunk is ended \r\n -> +2
                                    char * tmpBuf = new char[nChunkSize + 2];
                                    // Read actual data
                                    ss.read(tmpBuf, nChunkSize + 2);
                                    if (!bGzipped)
                                    {
                                        //Data not gzipped. Write them out directly
                                        fileOut.write(tmpBuf, nChunkSize);
                                    }
                                    else
                                    {
                                        //Data gzipped. Add them to vector to decompress later
                                        unchunkedData.insert(unchunkedData.end(), tmpBuf, tmpBuf + nChunkSize);
                                    }
                                    delete[] tmpBuf;
                                }
                                else
                                {
                                    // All chunks loaded. Break the while loop.
                                    break;
                                }
                            }
                            // Data was gzipped. Time to decompress
                            if (bGzipped)
                            {
                                std::stringbuf stringBuf;
                                membuf gzipdata(unchunkedData.data(), unchunkedData.data()+unchunkedData.size());
                                std::istream _in(&gzipdata);
                                std::ostream _out(&stringBuf);
                                inflate(_in, _out);
                                std::stringstream ssOut;
                                ssOut << _out.rdbuf();
                                std::string strDataOut = ssOut.str();
                                fileOut.write(strDataOut.c_str(), strDataOut.length());
                            }
                        }
                    }
                    // Reset flags
                    bChunked = false;
                    len = 0;
                    bGzipped = false;
                }
                // Otherwise just save it and try to find a key header info in it
                else
                {
                    fileOut << strLine << std::endl;
                    for (int i = 0; i < REGXCNT; ++i)
                    {
                        if (std::regex_search(strLine, match, regs[i]))
                        {
                            switch (i)
                            {
                            case 0:
                                len = std::stoi(match[1]);
                                break;
                            case 1:
                                bGzipped = true;
                                break;
                            case 2:
                                bChunked = true;
                                break;
                            }
                            break;
                        }
                    }
                }
            }

        }

        fileOut.flush();
        fileIn.close();
        fileOut.close();
    }

    return 0;
}

头文件 stdafx.h:

#pragma once
#pragma warning (disable: 4244)

#include <tchar.h>

#include <algorithm>
#include <cstddef>
#include <exception>
#include <fstream>
#include <iostream>
#include <limits>
#include <regex>
#include <sstream>
#include <streambuf>
#include <string>
#include <vector>

#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/tokenizer.hpp>

撰写回答