Training courses

Kernel and Embedded Linux

Bootlin training courses

Embedded Linux, kernel,
Yocto Project, Buildroot, real-time,
graphics, boot time, debugging...

Bootlin logo

Elixir Cross Referencer

   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150

The .xz File Format
===================

Version 1.0.4 (2009-08-27)


        0. Preface
           0.1. Notices and Acknowledgements
           0.2. Getting the Latest Version
           0.3. Version History
        1. Conventions
           1.1. Byte and Its Representation
           1.2. Multibyte Integers
        2. Overall Structure of .xz File
           2.1. Stream
                2.1.1. Stream Header
                       2.1.1.1. Header Magic Bytes
                       2.1.1.2. Stream Flags
                       2.1.1.3. CRC32
                2.1.2. Stream Footer
                       2.1.2.1. CRC32
                       2.1.2.2. Backward Size
                       2.1.2.3. Stream Flags
                       2.1.2.4. Footer Magic Bytes
           2.2. Stream Padding
        3. Block
           3.1. Block Header
                3.1.1. Block Header Size
                3.1.2. Block Flags
                3.1.3. Compressed Size
                3.1.4. Uncompressed Size
                3.1.5. List of Filter Flags
                3.1.6. Header Padding
                3.1.7. CRC32
           3.2. Compressed Data
           3.3. Block Padding
           3.4. Check
        4. Index
           4.1. Index Indicator
           4.2. Number of Records
           4.3. List of Records
                4.3.1. Unpadded Size
                4.3.2. Uncompressed Size
           4.4. Index Padding
           4.5. CRC32
        5. Filter Chains
           5.1. Alignment
           5.2. Security
           5.3. Filters
                5.3.1. LZMA2
                5.3.2. Branch/Call/Jump Filters for Executables
                5.3.3. Delta
                       5.3.3.1. Format of the Encoded Output
           5.4. Custom Filter IDs
                5.4.1. Reserved Custom Filter ID Ranges
        6. Cyclic Redundancy Checks
        7. References


0. Preface

        This document describes the .xz file format (filename suffix
        ".xz", MIME type "application/x-xz"). It is intended that this
        this format replace the old .lzma format used by LZMA SDK and
        LZMA Utils.


0.1. Notices and Acknowledgements

        This file format was designed by Lasse Collin
        <lasse.collin@tukaani.org> and Igor Pavlov.

        Special thanks for helping with this document goes to
        Ville Koskinen. Thanks for helping with this document goes to
        Mark Adler, H. Peter Anvin, Mikko Pouru, and Lars Wirzenius.

        This document has been put into the public domain.


0.2. Getting the Latest Version

        The latest official version of this document can be downloaded
        from <http://tukaani.org/xz/xz-file-format.txt>.

        Specific versions of this document have a filename
        xz-file-format-X.Y.Z.txt where X.Y.Z is the version number.
        For example, the version 1.0.0 of this document is available
        at <http://tukaani.org/xz/xz-file-format-1.0.0.txt>.


0.3. Version History

        Version   Date          Description

        1.0.4     2009-08-27    Language improvements in Sections 1.2,
                                2.1.1.2, 3.1.1, 3.1.2, and 5.3.1

        1.0.3     2009-06-05    Spelling fixes in Sections 5.1 and 5.4

        1.0.2     2009-06-04    Typo fixes in Sections 4 and 5.3.1

        1.0.1     2009-06-01    Typo fix in Section 0.3 and minor
                                clarifications to Sections 2, 2.2,
                                3.3, 4.4, and 5.3.2

        1.0.0     2009-01-14    The first official version


1. Conventions

        The key words "MUST", "MUST NOT", "REQUIRED", "SHOULD",
        "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
        document are to be interpreted as described in [RFC-2119].

        Indicating a warning means displaying a message, returning
        appropriate exit status, or doing something else to let the
        user know that something worth warning occurred. The operation
        SHOULD still finish if a warning is indicated.

        Indicating an error means displaying a message, returning
        appropriate exit status, or doing something else to let the
        user know that something prevented successfully finishing the
        operation. The operation MUST be aborted once an error has
        been indicated.


1.1. Byte and Its Representation

        In this document, byte is always 8 bits.

        A "null byte" has all bits unset. That is, the value of a null
        byte is 0x00.

        To represent byte blocks, this document uses notation that
        is similar to the notation used in [RFC-1952]:

            +-------+
            |  Foo  |   One byte.
            +-------+

            +---+---+
            |  Foo  |   Two bytes; that is, some of the vertical bars
            +---+---+   can be missing.

            +=======+
            |  Foo  |   Zero or more bytes.
            +=======+

        In this document, a boxed byte or a byte sequence declared
        using this notation is called "a field". The example field
        above would be called "the Foo field" or plain "Foo".

        If there are many fields, they may be split to multiple lines.
        This is indicated with an arrow ("--->"):

            +=====+
            | Foo |
            +=====+

                 +=====+
            ---> | Bar |
                 +=====+

        The above is equivalent to this:

            +=====+=====+
            | Foo | Bar |
            +=====+=====+


1.2. Multibyte Integers

        Multibyte integers of static length, such as CRC values,
        are stored in little endian byte order (least significant
        byte first).

        When smaller values are more likely than bigger values (for
        example file sizes), multibyte integers are encoded in a
        variable-length representation:
          - Numbers in the range [0, 127] are copied as is, and take
            one byte of space.
          - Bigger numbers will occupy two or more bytes. All but the
            last byte of the multibyte representation have the highest
            (eighth) bit set.

        For now, the value of the variable-length integers is limited
        to 63 bits, which limits the encoded size of the integer to
        nine bytes. These limits may be increased in the future if
        needed.

        The following C code illustrates encoding and decoding of
        variable-length integers. The functions return the number of
        bytes occupied by the integer (1-9), or zero on error.

            #include <stddef.h>
            #include <inttypes.h>

            size_t
            encode(uint8_t buf[static 9], uint64_t num)
            {
                if (num > UINT64_MAX / 2)
                    return 0;

                size_t i = 0;

                while (num >= 0x80) {
                    buf[i++] = (uint8_t)(num) | 0x80;
                    num >>= 7;
                }

                buf[i++] = (uint8_t)(num);

                return i;
            }

            size_t
            decode(const uint8_t buf[], size_t size_max, uint64_t *num)
            {
                if (size_max == 0)
                    return 0;

                if (size_max > 9)
                    size_max = 9;

                *num = buf[0] & 0x7F;
                size_t i = 0;

                while (buf[i++] & 0x80) {
                    if (i >= size_max || buf[i] == 0x00)
                        return 0;

                    *num |= (uint64_t)(buf[i] & 0x7F) << (i * 7);
                }

                return i;
            }


2. Overall Structure of .xz File

        A standalone .xz files consist of one or more Streams which may
        have Stream Padding between or after them:

            +========+================+========+================+
            | Stream | Stream Padding | Stream | Stream Padding | ...
            +========+================+========+================+

        The sizes of Stream and Stream Padding are always multiples
        of four bytes, thus the size of every valid .xz file MUST be
        a multiple of four bytes.

        While a typical file contains only one Stream and no Stream
        Padding, a decoder handling standalone .xz files SHOULD support
        files that have more than one Stream or Stream Padding.

        In contrast to standalone .xz files, when the .xz file format
        is used as an internal part of some other file format or
        communication protocol, it usually is expected that the decoder
        stops after the first Stream, and doesn't look for Stream
        Padding or possibly other Streams.


2.1. Stream

        +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+     +=======+
        |     Stream Header     | Block | Block | ... | Block |
        +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+     +=======+

             +=======+-+-+-+-+-+-+-+-+-+-+-+-+
        ---> | Index |     Stream Footer     |
             +=======+-+-+-+-+-+-+-+-+-+-+-+-+

        All the above fields have a size that is a multiple of four. If
        Stream is used as an internal part of another file format, it
        is RECOMMENDED to make the Stream start at an offset that is
        a multiple of four bytes.

        Stream Header, Index, and Stream Footer are always present in
        a Stream. The maximum size of the Index field is 16 GiB (2^34).

        There are zero or more Blocks. The maximum number of Blocks is
        limited only by the maximum size of the Index field.

        Total size of a Stream MUST be less than 8 EiB (2^63 bytes).
        The same limit applies to the total amount of uncompressed
        data stored in a Stream.

        If an implementation supports handling .xz files with multiple
        concatenated Streams, it MAY apply the above limits to the file
        as a whole instead of limiting per Stream basis.


2.1.1. Stream Header

        +---+---+---+---+---+---+-------+------+--+--+--+--+
        |  Header Magic Bytes   | Stream Flags |   CRC32   |
        +---+---+---+---+---+---+-------+------+--+--+--+--+


2.1.1.1. Header Magic Bytes

        The first six (6) bytes of the Stream are so called Header
        Magic Bytes. They can be used to identify the file type.

            Using a C array and ASCII:
            const uint8_t HEADER_MAGIC[6]
                    = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };

            In plain hexadecimal:
            FD 37 7A 58 5A 00

        Notes:
          - The first byte (0xFD) was chosen so that the files cannot
            be erroneously detected as being in .lzma format, in which
            the first byte is in the range [0x00, 0xE0].
          - The sixth byte (0x00) was chosen to prevent applications
            from misdetecting the file as a text file.

        If the Header Magic Bytes don't match, the decoder MUST
        indicate an error.


2.1.1.2. Stream Flags

        The first byte of Stream Flags is always a null byte. In the
        future, this byte may be used to indicate a new Stream version
        or other Stream properties.

        The second byte of Stream Flags is a bit field:

            Bit(s)  Mask  Description
             0-3    0x0F  Type of Check (see Section 3.4):
                              ID    Size      Check name
                              0x00   0 bytes  None
                              0x01   4 bytes  CRC32
                              0x02   4 bytes  (Reserved)
                              0x03   4 bytes  (Reserved)
                              0x04   8 bytes  CRC64
                              0x05   8 bytes  (Reserved)
                              0x06   8 bytes  (Reserved)
                              0x07  16 bytes  (Reserved)
                              0x08  16 bytes  (Reserved)
                              0x09  16 bytes  (Reserved)
                              0x0A  32 bytes  SHA-256
                              0x0B  32 bytes  (Reserved)
                              0x0C  32 bytes  (Reserved)
                              0x0D  64 bytes  (Reserved)
                              0x0E  64 bytes  (Reserved)
                              0x0F  64 bytes  (Reserved)
             4-7    0xF0  Reserved for future use; MUST be zero for now.

        Implementations SHOULD support at least the Check IDs 0x00
        (None) and 0x01 (CRC32). Supporting other Check IDs is
        OPTIONAL. If an unsupported Check is used, the decoder SHOULD
        indicate a warning or error.

        If any reserved bit is set, the decoder MUST indicate an error.
        It is possible that there is a new field present which the
        decoder is not aware of, and can thus parse the Stream Header
        incorrectly.


2.1.1.3. CRC32

        The CRC32 is calculated from the Stream Flags field. It is
        stored as an unsigned 32-bit little endian integer. If the
        calculated value does not match the stored one, the decoder
        MUST indicate an error.

        The idea is that Stream Flags would always be two bytes, even
        if new features are needed. This way old decoders will be able
        to verify the CRC32 calculated from Stream Flags, and thus
        distinguish between corrupt files (CRC32 doesn't match) and
        files that the decoder doesn't support (CRC32 matches but
        Stream Flags has reserved bits set).


2.1.2. Stream Footer

        +-+-+-+-+---+---+---+---+-------+------+----------+---------+
        | CRC32 | Backward Size | Stream Flags | Footer Magic Bytes |
        +-+-+-+-+---+---+---+---+-------+------+----------+---------+


2.1.2.1. CRC32

        The CRC32 is calculated from the Backward Size and Stream Flags
        fields. It is stored as an unsigned 32-bit little endian
        integer. If the calculated value does not match the stored one,
        the decoder MUST indicate an error.

        The reason to have the CRC32 field before the Backward Size and
        Stream Flags fields is to keep the four-byte fields aligned to
        a multiple of four bytes.


2.1.2.2. Backward Size

        Backward Size is stored as a 32-bit little endian integer,
        which indicates the size of the Index field as multiple of
        four bytes, minimum value being four bytes:

            real_backward_size = (stored_backward_size + 1) * 4;

        If the stored value does not match the real size of the Index
        field, the decoder MUST indicate an error.

        Using a fixed-size integer to store Backward Size makes
        it slightly simpler to parse the Stream Footer when the
        application needs to parse the Stream backwards.


2.1.2.3. Stream Flags

        This is a copy of the Stream Flags field from the Stream
        Header. The information stored to Stream Flags is needed
        when parsing the Stream backwards. The decoder MUST compare
        the Stream Flags fields in both Stream Header and Stream
        Footer, and indicate an error if they are not identical.


2.1.2.4. Footer Magic Bytes

        As the last step of the decoding process, the decoder MUST
        verify the existence of Footer Magic Bytes. If they don't
        match, an error MUST be indicated.

            Using a C array and ASCII:
            const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' };

            In hexadecimal:
            59 5A

        The primary reason to have Footer Magic Bytes is to make
        it easier to detect incomplete files quickly, without
        uncompressing. If the file does not end with Footer Magic Bytes
        (excluding Stream Padding described in Section 2.2), it cannot
        be undamaged, unless someone has intentionally appended garbage
        after the end of the Stream.


2.2. Stream Padding

        Only the decoders that support decoding of concatenated Streams
        MUST support Stream Padding.

        Stream Padding MUST contain only null bytes. To preserve the
        four-byte alignment of consecutive Streams, the size of Stream
        Padding MUST be a multiple of four bytes. Empty Stream Padding
        is allowed. If these requirements are not met, the decoder MUST
        indicate an error.

        Note that non-empty Stream Padding is allowed at the end of the
        file; there doesn't need to be a new Stream after non-empty
        Stream Padding. This can be convenient in certain situations
        [GNU-tar].

        The possibility of Stream Padding MUST be taken into account
        when designing an application that parses Streams backwards,
        and the application supports concatenated Streams.


3. Block

        +==============+=================+===============+=======+
        | Block Header | Compressed Data | Block Padding | Check |
        +==============+=================+===============+=======+


3.1. Block Header

        +-------------------+-------------+=================+
        | Block Header Size | Block Flags | Compressed Size |
        +-------------------+-------------+=================+

             +===================+======================+
        ---> | Uncompressed Size | List of Filter Flags |
             +===================+======================+

             +================+--+--+--+--+
        ---> | Header Padding |   CRC32   |
             +================+--+--+--+--+


3.1.1. Block Header Size

        This field overlaps with the Index Indicator field (see
        Section 4.1).

        This field contains the size of the Block Header field,
        including the Block Header Size field itself. Valid values are
        in the range [0x01, 0xFF], which indicate the size of the Block
        Header as multiples of four bytes, minimum size being eight
        bytes:

            real_header_size = (encoded_header_size + 1) * 4;

        If a Block Header bigger than 1024 bytes is needed in the
        future, a new field can be added between the Block Header and
        Compressed Data fields. The presence of this new field would
        be indicated in the Block Header field.


3.1.2. Block Flags

        The Block Flags field is a bit field:

            Bit(s)  Mask  Description
             0-1    0x03  Number of filters (1-4)
             2-5    0x3C  Reserved for future use; MUST be zero for now.
              6     0x40  The Compressed Size field is present.
              7     0x80  The Uncompressed Size field is present.

        If any reserved bit is set, the decoder MUST indicate an error.
        It is possible that there is a new field present which the
        decoder is not aware of, and can thus parse the Block Header
        incorrectly.


3.1.3. Compressed Size

        This field is present only if the appropriate bit is set in
        the Block Flags field (see Section 3.1.2).

        The Compressed Size field contains the size of the Compressed
        Data field, which MUST be non-zero. Compressed Size is stored
        using the encoding described in Section 1.2. If the Compressed
        Size doesn't match the size of the Compressed Data field, the
        decoder MUST indicate an error.


3.1.4. Uncompressed Size

        This field is present only if the appropriate bit is set in
        the Block Flags field (see Section 3.1.2).

        The Uncompressed Size field contains the size of the Block
        after uncompressing. Uncompressed Size is stored using the
        encoding described in Section 1.2. If the Uncompressed Size
        does not match the real uncompressed size, the decoder MUST
        indicate an error.

        Storing the Compressed Size and Uncompressed Size fields serves
        several purposes:
          - The decoder knows how much memory it needs to allocate
            for a temporary buffer in multithreaded mode.
          - Simple error detection: wrong size indicates a broken file.
          - Seeking forwards to a specific location in streamed mode.

        It should be noted that the only reliable way to determine
        the real uncompressed size is to uncompress the Block,
        because the Block Header and Index fields may contain
        (intentionally or unintentionally) invalid information.


3.1.5. List of Filter Flags

        +================+================+     +================+
        | Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags |
        +================+================+     +================+

        The number of Filter Flags fields is stored in the Block Flags
        field (see Section 3.1.2).

        The format of each Filter Flags field is as follows:

            +===========+====================+===================+
            | Filter ID | Size of Properties | Filter Properties |
            +===========+====================+===================+

        Both Filter ID and Size of Properties are stored using the
        encoding described in Section 1.2. Size of Properties indicates
        the size of the Filter Properties field as bytes. The list of
        officially defined Filter IDs and the formats of their Filter
        Properties are described in Section 5.3.

        Filter IDs greater than or equal to 0x4000_0000_0000_0000
        (2^62) are reserved for implementation-specific internal use.
        These Filter IDs MUST never be used in List of Filter Flags.


3.1.6. Header Padding

        This field contains as many null byte as it is needed to make
        the Block Header have the size specified in Block Header Size.
        If any of the bytes are not null bytes, the decoder MUST
        indicate an error. It is possible that there is a new field
        present which the decoder is not aware of, and can thus parse
        the Block Header incorrectly.


3.1.7. CRC32

        The CRC32 is calculated over everything in the Block Header
        field except the CRC32 field itself. It is stored as an
        unsigned 32-bit little endian integer. If the calculated
        value does not match the stored one, the decoder MUST indicate
        an error.

        By verifying the CRC32 of the Block Header before parsing the
        actual contents allows the decoder to distinguish between
        corrupt and unsupported files.


3.2. Compressed Data

        The format of Compressed Data depends on Block Flags and List
        of Filter Flags. Excluding the descriptions of the simplest
        filters in Section 5.3, the format of the filter-specific
        encoded data is out of scope of this document.


3.3. Block Padding

        Block Padding MUST contain 0-3 null bytes to make the size of
        the Block a multiple of four bytes. This can be needed when
        the size of Compressed Data is not a multiple of four. If any
        of the bytes in Block Padding are not null bytes, the decoder
        MUST indicate an error.


3.4. Check

        The type and size of the Check field depends on which bits
        are set in the Stream Flags field (see Section 2.1.1.2).

        The Check, when used, is calculated from the original
        uncompressed data. If the calculated Check does not match the
        stored one, the decoder MUST indicate an error. If the selected
        type of Check is not supported by the decoder, it SHOULD
        indicate a warning or error.


4. Index

        +-----------------+===================+
        | Index Indicator | Number of Records |
        +-----------------+===================+

             +=================+===============+-+-+-+-+
        ---> | List of Records | Index Padding | CRC32 |
             +=================+===============+-+-+-+-+

        Index serves several purposes. Using it, one can
          - verify that all Blocks in a Stream have been processed;
          - find out the uncompressed size of a Stream; and
          - quickly access the beginning of any Block (random access).


4.1. Index Indicator

        This field overlaps with the Block Header Size field (see
        Section 3.1.1). The value of Index Indicator is always 0x00.


4.2. Number of Records

        This field indicates how many Records there are in the List
        of Records field, and thus how many Blocks there are in the
        Stream. The value is stored using the encoding described in
        Section 1.2. If the decoder has decoded all the Blocks of the
        Stream, and then notices that the Number of Records doesn't
        match the real number of Blocks, the decoder MUST indicate an
        error.


4.3. List of Records

        List of Records consists of as many Records as indicated by the
        Number of Records field:

            +========+========+
            | Record | Record | ...
            +========+========+

        Each Record contains information about one Block:

            +===============+===================+
            | Unpadded Size | Uncompressed Size |
            +===============+===================+

        If the decoder has decoded all the Blocks of the Stream, it
        MUST verify that the contents of the Records match the real
        Unpadded Size and Uncompressed Size of the respective Blocks.

        Implementation hint: It is possible to verify the Index with
        constant memory usage by calculating for example SHA-256 of
        both the real size values and the List of Records, then
        comparing the hash values. Implementing this using
        non-cryptographic hash like CRC32 SHOULD be avoided unless
        small code size is important.

        If the decoder supports random-access reading, it MUST verify
        that Unpadded Size and Uncompressed Size of every completely
        decoded Block match the sizes stored in the Index. If only
        partial Block is decoded, the decoder MUST verify that the
        processed sizes don't exceed the sizes stored in the Index.


4.3.1. Unpadded Size

        This field indicates the size of the Block excluding the Block
        Padding field. That is, Unpadded Size is the size of the Block
        Header, Compressed Data, and Check fields. Unpadded Size is
        stored using the encoding described in Section 1.2. The value
        MUST never be zero; with the current structure of Blocks, the
        actual minimum value for Unpadded Size is five.

        Implementation note: Because the size of the Block Padding
        field is not included in Unpadded Size, calculating the total
        size of a Stream or doing random-access reading requires
        calculating the actual size of the Blocks by rounding Unpadded
        Sizes up to the next multiple of four.

        The reason to exclude Block Padding from Unpadded Size is to
        ease making a raw copy of Compressed Data without Block
        Padding. This can be useful, for example, if someone wants
        to convert Streams to some other file format quickly.


4.3.2. Uncompressed Size

        This field indicates the Uncompressed Size of the respective
        Block as bytes. The value is stored using the encoding
        described in Section 1.2.


4.4. Index Padding

        This field MUST contain 0-3 null bytes to pad the Index to
        a multiple of four bytes. If any of the bytes are not null
        bytes, the decoder MUST indicate an error.


4.5. CRC32

        The CRC32 is calculated over everything in the Index field
        except the CRC32 field itself. The CRC32 is stored as an
        unsigned 32-bit little endian integer. If the calculated
        value does not match the stored one, the decoder MUST indicate
        an error.


5. Filter Chains

        The Block Flags field defines how many filters are used. When
        more than one filter is used, the filters are chained; that is,
        the output of one filter is the input of another filter. The
        following figure illustrates the direction of data flow.

                    v   Uncompressed Data   ^
                    |       Filter 0        |
            Encoder |       Filter 1        | Decoder
                    |       Filter n        |
                    v    Compressed Data    ^


5.1. Alignment

        Alignment of uncompressed input data is usually the job of
        the application producing the data. For example, to get the
        best results, an archiver tool should make sure that all
        PowerPC executable files in the archive stream start at
        offsets that are multiples of four bytes.

        Some filters, for example LZMA2, can be configured to take
        advantage of specified alignment of input data. Note that
        taking advantage of aligned input can be beneficial also when
        a filter is not the first filter in the chain. For example,
        if you compress PowerPC executables, you may want to use the
        PowerPC filter and chain that with the LZMA2 filter. Because
        not only the input but also the output alignment of the PowerPC
        filter is four bytes, it is now beneficial to set LZMA2
        settings so that the LZMA2 encoder can take advantage of its
        four-byte-aligned input data.

        The output of the last filter in the chain is stored to the
        Compressed Data field, which is is guaranteed to be aligned
        to a multiple of four bytes relative to the beginning of the
        Stream. This can increase
          - speed, if the filtered data is handled multiple bytes at
            a time by the filter-specific encoder and decoder,
            because accessing aligned data in computer memory is
            usually faster; and
          - compression ratio, if the output data is later compressed
            with an external compression tool.


5.2. Security

        If filters would be allowed to be chained freely, it would be
        possible to create malicious files, that would be very slow to
        decode. Such files could be used to create denial of service
        attacks.

        Slow files could occur when multiple filters are chained:

            v   Compressed input data
            |   Filter 1 decoder (last filter)
            |   Filter 0 decoder (non-last filter)
            v   Uncompressed output data

        The decoder of the last filter in the chain produces a lot of
        output from little input. Another filter in the chain takes the
        output of the last filter, and produces very little output
        while consuming a lot of input. As a result, a lot of data is
        moved inside the filter chain, but the filter chain as a whole
        gets very little work done.

        To prevent this kind of slow files, there are restrictions on
        how the filters can be chained. These restrictions MUST be
        taken into account when designing new filters.

        The maximum number of filters in the chain has been limited to
        four, thus there can be at maximum of three non-last filters.
        Of these three non-last filters, only two are allowed to change
        the size of the data.

        The non-last filters, that change the size of the data, MUST
        have a limit how much the decoder can compress the data: the
        decoder SHOULD produce at least n bytes of output when the
        filter is given 2n bytes of input. This  limit is not
        absolute, but significant deviations MUST be avoided.

        The above limitations guarantee that if the last filter in the
        chain produces 4n bytes of output, the chain as a whole will
        produce at least n bytes of output.


5.3. Filters

5.3.1. LZMA2

        LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purpose
        compression algorithm with high compression ratio and fast
        decompression. LZMA is based on LZ77 and range coding
        algorithms.

        LZMA2 is an extension on top of the original LZMA. LZMA2 uses
        LZMA internally, but adds support for flushing the encoder,
        uncompressed chunks, eases stateful decoder implementations,
        and improves support for multithreading. Thus, the plain LZMA
        will not be supported in this file format.

            Filter ID:                  0x21
            Size of Filter Properties:  1 byte
            Changes size of data:       Yes
            Allow as a non-last filter: No
            Allow as the last filter:   Yes

            Preferred alignment:
                Input data:             Adjustable to 1/2/4/8/16 byte(s)
                Output data:            1 byte

        The format of the one-byte Filter Properties field is as
        follows:

            Bits   Mask   Description
            0-5    0x3F   Dictionary Size
            6-7    0xC0   Reserved for future use; MUST be zero for now.

        Dictionary Size is encoded with one-bit mantissa and five-bit
        exponent. The smallest dictionary size is 4 KiB and the biggest
        is 4 GiB.

            Raw value   Mantissa   Exponent   Dictionary size
                0           2         11         4 KiB
                1           3         11         6 KiB
                2           2         12         8 KiB
                3           3         12        12 KiB
                4           2         13        16 KiB
                5           3         13        24 KiB
                6           2         14        32 KiB
              ...         ...        ...      ...
               35           3         27       768 MiB
               36           2         28      1024 MiB
               37           3         29      1536 MiB
               38           2         30      2048 MiB
               39           3         30      3072 MiB
               40           2         31      4096 MiB - 1 B

        Instead of having a table in the decoder, the dictionary size
        can be decoded using the following C code:

            const uint8_t bits = get_dictionary_flags() & 0x3F;
            if (bits > 40)
                return DICTIONARY_TOO_BIG; // Bigger than 4 GiB

            uint32_t dictionary_size;
            if (bits == 40) {
                dictionary_size = UINT32_MAX;
            } else {
                dictionary_size = 2 | (bits & 1);
                dictionary_size <<= bits / 2 + 11;
            }


5.3.2. Branch/Call/Jump Filters for Executables

        These filters convert relative branch, call, and jump
        instructions to their absolute counterparts in executable
        files. This conversion increases redundancy and thus
        compression ratio.

            Size of Filter Properties:  0 or 4 bytes
            Changes size of data:       No
            Allow as a non-last filter: Yes
            Allow as the last filter:   No

        Below is the list of filters in this category. The alignment
        is the same for both input and output data.

            Filter ID   Alignment   Description
              0x04       1 byte     x86 filter (BCJ)
              0x05       4 bytes    PowerPC (big endian) filter
              0x06      16 bytes    IA64 filter
              0x07       4 bytes    ARM (little endian) filter
              0x08       2 bytes    ARM Thumb (little endian) filter
              0x09       4 bytes    SPARC filter

        If the size of Filter Properties is four bytes, the Filter
        Properties field contains the start offset used for address
        conversions. It is stored as an unsigned 32-bit little endian
        integer. The start offset MUST be a multiple of the alignment
        of the filter as listed in the table above; if it isn't, the
        decoder MUST indicate an error. If the size of Filter
        Properties is zero, the start offset is zero.

        Setting the start offset may be useful if an executable has
        multiple sections, and there are many cross-section calls.
        Taking advantage of this feature usually requires usage of
        the Subblock filter, whose design is not complete yet.


5.3.3. Delta

        The Delta filter may increase compression ratio when the value
        of the next byte correlates with the value of an earlier byte
        at specified distance.

            Filter ID:                  0x03
            Size of Filter Properties:  1 byte
            Changes size of data:       No
            Allow as a non-last filter: Yes
            Allow as the last filter:   No

            Preferred alignment:
                Input data:             1 byte
                Output data:            Same as the original input data

        The Properties byte indicates the delta distance, which can be
        1-256 bytes backwards from the current byte: 0x00 indicates
        distance of 1 byte and 0xFF distance of 256 bytes.


5.3.3.1. Format of the Encoded Output

        The code below illustrates both encoding and decoding with
        the Delta filter.

            // Distance is in the range [1, 256].
            const unsigned int distance = get_properties_byte() + 1;
            uint8_t pos = 0;
            uint8_t delta[256];

            memset(delta, 0, sizeof(delta));

            while (1) {
                const int byte = read_byte();
                if (byte == EOF)
                    break;

                uint8_t tmp = delta[(uint8_t)(distance + pos)];
                if (is_encoder) {
                    tmp = (uint8_t)(byte) - tmp;
                    delta[pos] = (uint8_t)(byte);
                } else {
                    tmp = (uint8_t)(byte) + tmp;
                    delta[pos] = tmp;
                }

                write_byte(tmp);
                --pos;
            }


5.4. Custom Filter IDs

        If a developer wants to use custom Filter IDs, he has two
        choices. The first choice is to contact Lasse Collin and ask
        him to allocate a range of IDs for the developer.

        The second choice is to generate a 40-bit random integer,
        which the developer can use as his personal Developer ID.
        To minimize the risk of collisions, Developer ID has to be
        a randomly generated integer, not manually selected "hex word".
        The following command, which works on many free operating
        systems, can be used to generate Developer ID:

            dd if=/dev/urandom bs=5 count=1 | hexdump

        The developer can then use his Developer ID to create unique
        (well, hopefully unique) Filter IDs.

            Bits    Mask                    Description
             0-15   0x0000_0000_0000_FFFF   Filter ID
            16-55   0x00FF_FFFF_FFFF_0000   Developer ID
            56-62   0x3F00_0000_0000_0000   Static prefix: 0x3F

        The resulting 63-bit integer will use 9 bytes of space when
        stored using the encoding described in Section 1.2. To get
        a shorter ID, see the beginning of this Section how to
        request a custom ID range.


5.4.1. Reserved Custom Filter ID Ranges

        Range                       Description
        0x0000_0300 - 0x0000_04FF   Reserved to ease .7z compatibility
        0x0002_0000 - 0x0007_FFFF   Reserved to ease .7z compatibility
        0x0200_0000 - 0x07FF_FFFF   Reserved to ease .7z compatibility


6. Cyclic Redundancy Checks

        There are several incompatible variations to calculate CRC32
        and CRC64. For simplicity and clarity, complete examples are
        provided to calculate the checks as they are used in this file
        format. Implementations MAY use different code as long as it
        gives identical results.

        The program below reads data from standard input, calculates
        the CRC32 and CRC64 values, and prints the calculated values
        as big endian hexadecimal strings to standard output.

            #include <stddef.h>
            #include <inttypes.h>
            #include <stdio.h>

            uint32_t crc32_table[256];
            uint64_t crc64_table[256];

            void
            init(void)
            {
                static const uint32_t poly32 = UINT32_C(0xEDB88320);
                static const uint64_t poly64
                        = UINT64_C(0xC96C5795D7870F42);

                for (size_t i = 0; i < 256; ++i) {
                    uint32_t crc32 = i;
                    uint64_t crc64 = i;

                    for (size_t j = 0; j < 8; ++j) {
                        if (crc32 & 1)
                            crc32 = (crc32 >> 1) ^ poly32;
                        else
                            crc32 >>= 1;

                        if (crc64 & 1)
                            crc64 = (crc64 >> 1) ^ poly64;
                        else
                            crc64 >>= 1;
                    }

                    crc32_table[i] = crc32;
                    crc64_table[i] = crc64;
                }
            }

            uint32_t
            crc32(const uint8_t *buf, size_t size, uint32_t crc)
            {
                crc = ~crc;
                for (size_t i = 0; i < size; ++i)
                    crc = crc32_table[buf[i] ^ (crc & 0xFF)]
                            ^ (crc >> 8);
                return ~crc;
            }

            uint64_t
            crc64(const uint8_t *buf, size_t size, uint64_t crc)
            {
                crc = ~crc;
                for (size_t i = 0; i < size; ++i)
                    crc = crc64_table[buf[i] ^ (crc & 0xFF)]
                            ^ (crc >> 8);
                return ~crc;
            }

            int
            main()
            {
                init();

                uint32_t value32 = 0;
                uint64_t value64 = 0;
                uint64_t total_size = 0;
                uint8_t buf[8192];

                while (1) {
                    const size_t buf_size
                            = fread(buf, 1, sizeof(buf), stdin);
                    if (buf_size == 0)
                        break;

                    total_size += buf_size;
                    value32 = crc32(buf, buf_size, value32);
                    value64 = crc64(buf, buf_size, value64);
                }

                printf("Bytes:  %" PRIu64 "\n", total_size);
                printf("CRC-32: 0x%08" PRIX32 "\n", value32);
                printf("CRC-64: 0x%016" PRIX64 "\n", value64);

                return 0;
            }


7. References

        LZMA SDK - The original LZMA implementation
        http://7-zip.org/sdk.html

        LZMA Utils - LZMA adapted to POSIX-like systems
        http://tukaani.org/lzma/

        XZ Utils - The next generation of LZMA Utils
        http://tukaani.org/xz/

        [RFC-1952]
        GZIP file format specification version 4.3
        http://www.ietf.org/rfc/rfc1952.txt
          - Notation of byte boxes in section "2.1. Overall conventions"

        [RFC-2119]
        Key words for use in RFCs to Indicate Requirement Levels
        http://www.ietf.org/rfc/rfc2119.txt

        [GNU-tar]
        GNU tar 1.21 manual
        http://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html
          - Node 9.4.2 "Blocking Factor", paragraph that begins
            "gzip will complain about trailing garbage"
          - Note that this URL points to the latest version of the
            manual, and may some day not contain the note which is in
            1.21. For the exact version of the manual, download GNU
            tar 1.21: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.21.tar.gz