Remove HTTP headers from gzip or zip on stdin yy054 (revised)

  • Correction:

          /* yy054: filter a stream of HTTP responses read from stdin.
             With no arguments, drop every HTTP header block and pass the
             bodies (multiple gzip members or a single zip) through to stdout.
             With any argv[1], print only the header blocks instead. */

         int fileno (FILE *);
         /* nonzero from a status line until the CRLF that ends the header block */
         static int in_headers;
         /* nonzero when argv[1] was given: print headers, discard bodies */
         static int headers_only;
         /* true when the text just matched belongs to the part being printed */
         #define WANTED (headers_only ? in_headers : !in_headers)
        %option nounput noinput noyywrap
        %%
        HTTP\/[\40-\176]+\x0d\x0a {
                /* status line: a header block starts here; it is never body data */
                in_headers = 1;
                if (headers_only) ECHO;
            }
        [\40-\176]+:[\40-\176]+\r\n {
                /* "name: value" line: a header inside a header block,
                   ordinary data when seen inside a body */
                if (WANTED) ECHO;
            }
        \x0D\x0A {
                /* bare CRLF: ends the current header block (if any) */
                if (WANTED) ECHO;
                in_headers = 0;
            }
        .|\n {
                /* everything else; stands in for flex's default rule so that
                   body bytes can be suppressed in headers-only mode */
                if (!headers_only || in_headers) ECHO;
            }
        %%
        int main(int argc, char **argv)
        {
            (void)argv;  /* only the presence of an argument matters */
            headers_only = argc > 1;
            yylex();
            return 0;
        }
    
    
    Usage example:

    Retrieve hostnames, IP addresses and (if available) sitemap URLs from the latest Common Crawl.

         # Download the list of robots.txt archive paths, then request the first
         # five of them over a single TLS connection, strip the HTTP headers with
         # yy054 and keep only the matching lines of the decompressed data.
         ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
         # "[]" is a placeholder pair that tr turns into CR and LF; it is quoted so
         # the shell never treats it as a glob pattern. Every request but the last
         # gets "Connection: keep-alive", the last one gets "Connection: close".
         gzip -dc robotstxt.paths.gz \
         |head -5 \
         |sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
               $!s/$/keep-alive[]/;$s/$/close[]/' \
         |tr '[]' '\r\n' \
         |openssl s_client -quiet -connect data.commoncrawl.org:443 \
         |yy054 \
         |zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
         exec cat 1.txt

  • Usage example:

    Download NetBSD 1.0 in a single TCP connection.

        # Turn each file name in the here-document into one HTTP request and send
        # them all over a single TLS connection; the raw responses (headers and
        # bodies) are saved to the file http+gzip.
        # "[]" is a placeholder pair that tr converts to CR and LF. Every request
        # but the last is HTTP/1.1 with "Connection: keep-alive"; the last one is
        # HTTP/1.0 without it (presumably so the server closes the connection).
        y="GET /pub/NetBSD-archive/NetBSD-1.0/source/src10/"
        z="Host: archive.netbsd.org"
        sed '$!s>.*>'"$y"'& HTTP/1.1[]'"$z"'[]Connection: keep-alive[]>;
             $s>.*>'"$y"'& HTTP/1.0[]'"$z"'[]>' << eof \
        |tr '[]' '\r\n' \
        |openssl s_client -quiet -connect 151.101.129.6:443 -servername archive.netbsd.org > http+gzip
        src10.aa
        src10.ab
        src10.ac
        src10.ad
        src10.ae
        src10.af
        src10.ag
        src10.ah
        src10.ai
        src10.aj
        src10.ak
        src10.al
        src10.am
        src10.an
        src10.ao
        src10.ap
        src10.aq
        src10.ar
        src10.as
        src10.at
        src10.au
        src10.av
        src10.aw
        src10.ax
        src10.ay
        src10.az
        src10.ba
        src10.bb
        src10.bc
        src10.bd
        src10.be
        src10.bf
        eof
    
        # Strip the HTTP headers from the saved responses and list the contents
        # of the remaining gzip-compressed tar stream.
        yy054 < http+gzip|tar tvzf /dev/stdin
    
    Alternate usage:

    Passing any argument as argv[1] prints only the HTTP headers.

        # same input file, two different argv[1] values
        yy054 print < http+gzip
        yy054 x < http+gzip