Great, I'm surprised by your ability to decipher the file type.
When PSPP fully supports ZSAV, it will be a big improvement in handling large
databases.

Thank you

--
Hugo Valencia


2013/10/9 Ben Pfaff <b...@cs.stanford.edu>

> I modified pspp-dump-sav to interpret the descriptors in zsav
> compressed files.  So far this modified pspp-dump-sav interprets,
> without complaint, all seven of the .zsav files I have.
>
> The next step is to modify the PSPP sav file reader to actually read the
> data.
>
> --8<--------------------------cut here-------------------------->8--
>
>
> diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c
> index c6b5823..6ca45bc 100644
> --- a/utilities/pspp-dump-sav.c
> +++ b/utilities/pspp-dump-sav.c
> @@ -14,6 +14,34 @@
>     You should have received a copy of the GNU General Public License
>     along with this program.  If not, see <http://www.gnu.org/licenses/>.
> */
>
> +/*
> +000035a  5a 03 00 00 00 00 00 00 - Byte offset of this block, 0x35a
> +0000362  12 94 03 00 00 00 00 00 - Byte offset of the next block, 0x39412.
> +000036a  48 00 00 00 00 00 00 00 - Length of next block's header, 0x48
> bytes.
> +
> +0000372  0x37c49 bytes of compressed data that inflate to 0x3ff000 bytes
> +0037fbb  0x1457 bytes of compressed data that inflate to 0x6bf00 bytes
> +
> +0039412  9c ff ff ff ff ff ff ff - Value -100, dunno why (compression
> bias?)
> +003941a  00 00 00 00 00 00 00 00 - ?
> +0039422  00 f0 3f 00             - Inflated data block size
> +0039426  02 00 00 00             - Number of compressed data blocks
> +
> +First compressed data block descriptor:
> +003942a  5a 03 00 00 00 00 00 00
> +       = starting byte offset of data in block 1 if no zlib compression
> +0039432  72 03 00 00 00 00 00 00 - Starting offset of data block, 0x372.
> +003943a  00 f0 3f 00             - Inflated data size, 0x3ff000 bytes.
> +003943e  49 7c 03 00             - Compressed data size, 0x37c49 bytes.
> +
> +Second compressed data block descriptor:
> +0039442  5a f3 3f 00 00 00 00 00 - 0x3ff35a = 0x35a + 0x3ff000
> +       = starting byte offset of data in block 2 if no zlib compression
> +003944a  bb 7f 03 00 00 00 00 00 - Starting offset of data block, 0x37fbb.
> +0039452  00 bf 06 00             - Inflated data size, 0x6bf00 bytes.
> +0039456  57 14 00 00             - Deflated data size, 0x1457 bytes.
> +*/
> +
>  #include <config.h>
>
>  #include <ctype.h>
> @@ -39,6 +67,13 @@
>
>  #define ID_MAX_LEN 64
>
> +enum compression
> +  {
> +    COMP_NONE,
> +    COMP_SIMPLE,
> +    COMP_ZLIB
> +  };
> +
>  struct sfm_reader
>    {
>      const char *file_name;
> @@ -52,7 +87,7 @@ struct sfm_reader
>      enum integer_format integer_format;
>      enum float_format float_format;
>
> -    bool compressed;
> +    enum compression compression;
>      double bias;
>    };
>
> @@ -87,7 +122,8 @@ static void read_long_string_missing_values (struct
> sfm_reader *r,
>                                               size_t size, size_t count);
>  static void read_unknown_extension (struct sfm_reader *,
>                                      size_t size, size_t count);
> -static void read_compressed_data (struct sfm_reader *, int max_cases);
> +static void read_simple_compressed_data (struct sfm_reader *, int
> max_cases);
> +static void read_zlib_compressed_data (struct sfm_reader *);
>
>  static struct text_record *open_text_record (
>    struct sfm_reader *, size_t size);
> @@ -180,7 +216,7 @@ main (int argc, char *argv[])
>        r.n_var_widths = 0;
>        r.allocated_var_widths = 0;
>        r.var_widths = 0;
> -      r.compressed = false;
> +      r.compression = COMP_NONE;
>
>        if (argc - optind > 1)
>          printf ("Reading \"%s\":\n", r.file_name);
> @@ -218,8 +254,13 @@ main (int argc, char *argv[])
>                (long long int) ftello (r.file),
>                (long long int) ftello (r.file) + 4);
>
> -      if (r.compressed && max_cases > 0)
> -        read_compressed_data (&r, max_cases);
> +      if (r.compression == COMP_SIMPLE)
> +        {
> +          if (max_cases > 0)
> +            read_simple_compressed_data (&r, max_cases);
> +        }
> +      else if (r.compression == COMP_ZLIB)
> +        read_zlib_compressed_data (&r);
>
>        fclose (r.file);
>      }
> @@ -245,7 +286,11 @@ read_header (struct sfm_reader *r)
>    read_string (r, rec_type, sizeof rec_type);
>    read_string (r, eye_catcher, sizeof eye_catcher);
>
> -  if (strcmp ("$FL2", rec_type) != 0)
> +  if (!strcmp ("$FL2", rec_type))
> +    r->compression = COMP_NONE;
> +  else if (!strcmp ("$FL3", rec_type))
> +    r->compression = COMP_ZLIB;
> +  else
>      sys_error (r, "This is not an SPSS system file.");
>
>    /* Identify integer format. */
> @@ -265,7 +310,20 @@ read_header (struct sfm_reader *r)
>    weight_index = read_int (r);
>    ncases = read_int (r);
>
> -  r->compressed = compressed != 0;
> +  if (r->compression == COMP_NONE)
> +    {
> +      if (compressed == 1)
> +        r->compression = COMP_SIMPLE;
> +      else if (compressed != 0)
> +        sys_error (r, "SAV file header has invalid compression value "
> +                   "%"PRId32".", compressed);
> +    }
> +  else
> +    {
> +      if (compressed != 2)
> +        sys_error (r, "ZSAV file header has invalid compression value "
> +                   "%"PRId32".", compressed);
> +    }
>
>    /* Identify floating-point format and obtain compression bias. */
>    read_bytes (r, raw_bias, sizeof raw_bias);
> @@ -289,7 +347,12 @@ read_header (struct sfm_reader *r)
>    printf ("File header record:\n");
>    printf ("\t%17s: %s\n", "Product name", eye_catcher);
>    printf ("\t%17s: %"PRId32"\n", "Layout code", layout_code);
> -  printf ("\t%17s: %"PRId32"\n", "Compressed", compressed);
> +  printf ("\t%17s: %"PRId32" (%s)\n", "Compressed",
> +          compressed,
> +          r->compression == COMP_NONE ? "no compression"
> +          : r->compression == COMP_SIMPLE ? "simple compression"
> +          : r->compression == COMP_ZLIB ? "ZLIB compression"
> +          : "<error>");
>    printf ("\t%17s: %"PRId32"\n", "Weight index", weight_index);
>    printf ("\t%17s: %"PRId32"\n", "Number of cases", ncases);
>    printf ("\t%17s: %g\n", "Compression bias", r->bias);
> @@ -1170,7 +1233,7 @@ read_variable_attributes (struct sfm_reader *r,
> size_t size, size_t count)
>  }
>
>  static void
> -read_compressed_data (struct sfm_reader *r, int max_cases)
> +read_simple_compressed_data (struct sfm_reader *r, int max_cases)
>  {
>    enum { N_OPCODES = 8 };
>    uint8_t opcodes[N_OPCODES];
> @@ -1258,6 +1321,82 @@ read_compressed_data (struct sfm_reader *r, int
> max_cases)
>          }
>      }
>  }
> +
> +static void
> +read_zlib_compressed_data (struct sfm_reader *r)
> +{
> +  long long int ofs;
> +  long long int this_ofs, next_ofs, next_len;
> +  long long int bias, zero;
> +  unsigned int block_size, n_blocks;
> +  unsigned int i;
> +
> +  read_int (r);
> +  ofs = ftello (r->file);
> +  printf ("\n%08llx: ZLIB compressed data header:\n", ofs);
> +
> +  this_ofs = read_int64 (r);
> +  next_ofs = read_int64 (r);
> +  next_len = read_int64 (r);
> +
> +  printf ("\tHeader offset: 0x%llx\n", this_ofs);
> +  if (this_ofs != ofs)
> +    printf ("\t\t(This was expected to be 0x%llx.)\n", ofs);
> +  printf ("\tTrailer offset: 0x%llx\n", next_ofs);
> +  printf ("\tTrailer length: %lld\n", next_len);
> +  if (next_len < 24 || next_len % 24)
> +    printf ("\t\t(Trailer length is not a positive multiple of 24.)\n");
> +
> +  printf ("\n%08llx: 0x%llx bytes of ZLIB compressed data\n",
> +          ofs + 8 * 3, next_ofs - (ofs + 8 * 3));
> +
> +  skip_bytes (r, next_ofs - (ofs + 8 * 3));
> +
> +  printf ("\n%08llx: ZLIB compressed data trailer:\n", next_ofs);
> +  bias = read_int64 (r);
> +  zero = read_int64 (r);
> +  block_size = read_int (r);
> +  n_blocks = read_int (r);
> +  printf ("\tCompression bias: %lld\n", bias);
> +  printf ("\tZero: 0x%llx\n", zero);
> +  if (zero != 0)
> +    printf ("\t\t(This was expected to be 0.)\n");
> +  printf ("\tBlock size: 0x%x\n", block_size);
> +  if (block_size != 0x3ff000)
> +    printf ("\t\t(Block size is ordinarily 0x3ff000.)\n");
> +  printf ("\tNumber of blocks: %u\n", n_blocks);
> +  if (n_blocks != next_len / 24 - 1)
> +    printf ("\t\t(Expected %llu blocks.)\n", next_len / 24 - 1);
> +
> +  for (i = 0; i < n_blocks; i++)
> +    {
> +      long long int blockinfo_ofs = ftello (r->file);
> +      unsigned long long int uncompressed_ofs = read_int64 (r);
> +      unsigned long long int compressed_ofs = read_int64 (r);
> +      unsigned int inflated_size = read_int (r);
> +      unsigned int deflated_size = read_int (r);
> +
> +      printf ("\n%08llx: Block info for block %d of %d\n",
> +              blockinfo_ofs, i + 1, n_blocks);
> +
> +      printf ("\tOffset if ZLIB were turned off: 0x%llx\n",
> uncompressed_ofs);
> +      if (i == 0 && uncompressed_ofs != ofs)
> +        printf ("\t\t(This was expected to be 0x%llx.)\n", ofs);
> +
> +      printf ("\tOffset of ZLIB compressed data: 0x%llx\n",
> compressed_ofs);
> +      if (i == 0 && compressed_ofs != ofs + 24)
> +        printf ("\t\t(This was expected to be 0x%llx.)\n", ofs + 24);
> +
> +      printf ("\tDeflated data length: 0x%x\n", deflated_size);
> +      if (i == n_blocks - 1 && compressed_ofs + deflated_size != next_ofs)
> +        printf ("\t\t(This was expected to be 0x%llx.)\n",
> +                next_ofs - deflated_size);
> +
> +      printf ("\tInflated data length: 0x%x\n", inflated_size);
> +      if (i < n_blocks - 1 && inflated_size != block_size)
> +        printf ("\t\t(This was expected to be 0x%x.)\n", block_size);
> +    }
> +}
>
>  /* Helpers for reading records that consist of structured text
>     strings. */
>
_______________________________________________
Pspp-users mailing list
Pspp-users@gnu.org
https://lists.gnu.org/mailman/listinfo/pspp-users

Reply via email to