Great, I'm surprised by your ability to decipher the file type. When PSPP fully supports ZSAV, it will be a big improvement in handling large databases.
Thank you -- Hugo Valencia 2013/10/9 Ben Pfaff <b...@cs.stanford.edu> > I modified to pspp-dump-sav to interpret the descriptors in zsav > compressed files. So far this modified pspp-dump-sav interprets, > without complaint, all seven of the .zsav files I have. > > The next step is to modify the PSPP sav file reader to actually read the > data. > > --8<--------------------------cut here-------------------------->8-- > > > diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c > index c6b5823..6ca45bc 100644 > --- a/utilities/pspp-dump-sav.c > +++ b/utilities/pspp-dump-sav.c > @@ -14,6 +14,34 @@ > You should have received a copy of the GNU General Public License > along with this program. If not, see <http://www.gnu.org/licenses/>. > */ > > +/* > +000035a 5a 03 00 00 00 00 00 00 - Byte offset of this block, 0x35a > +0000362 12 94 03 00 00 00 00 00 - Byte offset of the next block, 0x39412. > +000036a 48 00 00 00 00 00 00 00 - Length of next block's header, 0x48 > bytes. > + > +0000372 0x37c49 bytes of compressed data that inflate to 0x3ff000 bytes > +0037fbb 0x1457 bytes of compressed data that inflate to 0x6bf00 bytes > + > +0039412 9c ff ff ff ff ff ff ff - Value -100, dunno why (compression > bias?) > +003941a 00 00 00 00 00 00 00 00 - ? > +0039422 00 f0 3f 00 - Inflated data block size > +0034926 02 00 00 00 - Number of compressed data blocks > + > +First compressed data block descriptor: > +003942a 5a 03 00 00 00 00 00 00 > + = starting byte offset of data in block 1 if no zlib compression > +0039432 72 03 00 00 00 00 00 00 - Starting offset of data block, 0x372. > +003943a 00 f0 3f 00 - Inflated data size, 0x3ff000 bytes. > +003943e 49 7c 03 00 - Compressed data size, 0x37c49 bytes. > + > +Second compressed data block descriptor: > +0039442 5a f3 3f 00 00 00 00 00 - 0x3ff35a = 0x35a + 0x3ff000 > + = starting byte offset of data in block 2 if no zlib compression > +003944a bb 7f 03 00 00 00 00 00 - Starting offset of data block, 0x37fbb. 
> +0039452 00 bf 06 00 - Inflated data size, 0x6bf00 bytes. > +0039456 57 14 00 00 - Deflated data size, 0x1457 bytes. > +*/ > + > #include <config.h> > > #include <ctype.h> > @@ -39,6 +67,13 @@ > > #define ID_MAX_LEN 64 > > +enum compression > + { > + COMP_NONE, > + COMP_SIMPLE, > + COMP_ZLIB > + }; > + > struct sfm_reader > { > const char *file_name; > @@ -52,7 +87,7 @@ struct sfm_reader > enum integer_format integer_format; > enum float_format float_format; > > - bool compressed; > + enum compression compression; > double bias; > }; > > @@ -87,7 +122,8 @@ static void read_long_string_missing_values (struct > sfm_reader *r, > size_t size, size_t count); > static void read_unknown_extension (struct sfm_reader *, > size_t size, size_t count); > -static void read_compressed_data (struct sfm_reader *, int max_cases); > +static void read_simple_compressed_data (struct sfm_reader *, int > max_cases); > +static void read_zlib_compressed_data (struct sfm_reader *); > > static struct text_record *open_text_record ( > struct sfm_reader *, size_t size); > @@ -180,7 +216,7 @@ main (int argc, char *argv[]) > r.n_var_widths = 0; > r.allocated_var_widths = 0; > r.var_widths = 0; > - r.compressed = false; > + r.compression = COMP_NONE; > > if (argc - optind > 1) > printf ("Reading \"%s\":\n", r.file_name); > @@ -218,8 +254,13 @@ main (int argc, char *argv[]) > (long long int) ftello (r.file), > (long long int) ftello (r.file) + 4); > > - if (r.compressed && max_cases > 0) > - read_compressed_data (&r, max_cases); > + if (r.compression == COMP_SIMPLE) > + { > + if (max_cases > 0) > + read_simple_compressed_data (&r, max_cases); > + } > + else if (r.compression == COMP_ZLIB) > + read_zlib_compressed_data (&r); > > fclose (r.file); > } > @@ -245,7 +286,11 @@ read_header (struct sfm_reader *r) > read_string (r, rec_type, sizeof rec_type); > read_string (r, eye_catcher, sizeof eye_catcher); > > - if (strcmp ("$FL2", rec_type) != 0) > + if (!strcmp ("$FL2", rec_type)) > + 
r->compression = COMP_NONE; > + else if (!strcmp ("$FL3", rec_type)) > + r->compression = COMP_ZLIB; > + else > sys_error (r, "This is not an SPSS system file."); > > /* Identify integer format. */ > @@ -265,7 +310,20 @@ read_header (struct sfm_reader *r) > weight_index = read_int (r); > ncases = read_int (r); > > - r->compressed = compressed != 0; > + if (r->compression == COMP_NONE) > + { > + if (compressed == 1) > + r->compression = COMP_SIMPLE; > + else if (compressed != 0) > + sys_error (r, "SAV file header has invalid compression value " > + "%"PRId32".", compressed); > + } > + else > + { > + if (compressed != 2) > + sys_error (r, "ZSAV file header has invalid compression value " > + "%"PRId32".", compressed); > + } > > /* Identify floating-point format and obtain compression bias. */ > read_bytes (r, raw_bias, sizeof raw_bias); > @@ -289,7 +347,12 @@ read_header (struct sfm_reader *r) > printf ("File header record:\n"); > printf ("\t%17s: %s\n", "Product name", eye_catcher); > printf ("\t%17s: %"PRId32"\n", "Layout code", layout_code); > - printf ("\t%17s: %"PRId32"\n", "Compressed", compressed); > + printf ("\t%17s: %"PRId32" (%s)\n", "Compressed", > + compressed, > + r->compression == COMP_NONE ? "no compression" > + : r->compression == COMP_SIMPLE ? "simple compression" > + : r->compression == COMP_ZLIB ? 
"ZLIB compression" > + : "<error>"); > printf ("\t%17s: %"PRId32"\n", "Weight index", weight_index); > printf ("\t%17s: %"PRId32"\n", "Number of cases", ncases); > printf ("\t%17s: %g\n", "Compression bias", r->bias); > @@ -1170,7 +1233,7 @@ read_variable_attributes (struct sfm_reader *r, > size_t size, size_t count) > } > > static void > -read_compressed_data (struct sfm_reader *r, int max_cases) > +read_simple_compressed_data (struct sfm_reader *r, int max_cases) > { > enum { N_OPCODES = 8 }; > uint8_t opcodes[N_OPCODES]; > @@ -1258,6 +1321,82 @@ read_compressed_data (struct sfm_reader *r, int > max_cases) > } > } > } > + > +static void > +read_zlib_compressed_data (struct sfm_reader *r) > +{ > + long long int ofs; > + long long int this_ofs, next_ofs, next_len; > + long long int bias, zero; > + unsigned int block_size, n_blocks; > + unsigned int i; > + > + read_int (r); > + ofs = ftello (r->file); > + printf ("\n%08llx: ZLIB compressed data header:\n", ofs); > + > + this_ofs = read_int64 (r); > + next_ofs = read_int64 (r); > + next_len = read_int64 (r); > + > + printf ("\tHeader offset: 0x%llx\n", this_ofs); > + if (this_ofs != ofs) > + printf ("\t\t(This was expected to be 0x%llx.)\n", ofs); > + printf ("\tTrailer offset: 0x%llx\n", next_ofs); > + printf ("\tTrailer length: %lld\n", next_len); > + if (next_len < 24 || next_len % 24) > + printf ("\t\t(Trailer length is not a positive multiple of 24.)\n"); > + > + printf ("\n%08llx: 0x%llx bytes of ZLIB compressed data\n", > + ofs + 8 * 3, next_ofs - (ofs + 8 * 3)); > + > + skip_bytes (r, next_ofs - (ofs + 8 * 3)); > + > + printf ("\n%08llx: ZLIB compressed data trailer:\n", next_ofs); > + bias = read_int64 (r); > + zero = read_int64 (r); > + block_size = read_int (r); > + n_blocks = read_int (r); > + printf ("\tCompression bias: %lld\n", bias); > + printf ("\tZero: 0x%llx\n", zero); > + if (zero != 0) > + printf ("\t\t(This was expected to be 0.)\n"); > + printf ("\tBlock size: 0x%x\n", block_size); > + if 
(block_size != 0x3ff000) > + printf ("\t\t(Block size is ordinarily 0x3ff000.)\n"); > + printf ("\tNumber of blocks: %u\n", n_blocks); > + if (n_blocks != next_len / 24 - 1) > + printf ("\t\t(Expected %llu blocks.)\n", next_len / 24 - 1); > + > + for (i = 0; i < n_blocks; i++) > + { > + long long int blockinfo_ofs = ftello (r->file); > + unsigned long long int uncompressed_ofs = read_int64 (r); > + unsigned long long int compressed_ofs = read_int64 (r); > + unsigned int inflated_size = read_int (r); > + unsigned int deflated_size = read_int (r); > + > + printf ("\n%08llx: Block info for block %d of %d\n", > + blockinfo_ofs, i + 1, n_blocks); > + > + printf ("\tOffset if ZLIB were turned off: 0x%llx\n", > uncompressed_ofs); > + if (i == 0 && uncompressed_ofs != ofs) > + printf ("\t\t(This was expected to be 0x%llx.)\n", ofs); > + > + printf ("\tOffset of ZLIB compressed data: 0x%llx\n", > compressed_ofs); > + if (i == 0 && compressed_ofs != ofs + 24) > + printf ("\t\t(This was expected to be 0x%llx.)\n", ofs + 24); > + > + printf ("\tDeflated data length: 0x%x\n", deflated_size); > + if (i == n_blocks - 1 && compressed_ofs + deflated_size != next_ofs) > + printf ("\t\t(This was expected to be 0x%llx.)\n", > + next_ofs - deflated_size); > + > + printf ("\tInflated data length: 0x%x\n", inflated_size); > + if (i < n_blocks - 1 && inflated_size != block_size) > + printf ("\t\t(This was expected to be 0x%x.)\n", block_size); > + } > +} > > /* Helpers for reading records that consist of structured text > strings. */ >
_______________________________________________ Pspp-users mailing list Pspp-users@gnu.org https://lists.gnu.org/mailman/listinfo/pspp-users