Compare commits

...

4 Commits

Author SHA1 Message Date
ad4857317f [python] support cromfs extraction 2024-09-19 09:32:02 +07:00
558deed4c8 [python] some updates
- add ELF
- add basic ROMFS
- small fixs
2024-08-30 23:03:23 +07:00
7774c948d2 [python] add view interface 2024-08-30 21:22:49 +07:00
7c2d040195 [python] add cromfs 2024-08-30 21:21:58 +07:00
8 changed files with 284 additions and 18 deletions

View File

@ -2,13 +2,18 @@ import argparse
import os import os
import io import io
from pathlib import Path
import matcher import matcher
signatures = [ signatures = [
matcher.Zip, matcher.Zip,
matcher.Ambarella, matcher.Ambarella,
matcher.SquashFS, matcher.SquashFS,
matcher.FlattenDeviceTree matcher.RomFS,
matcher.CromFS,
matcher.FlattenDeviceTree,
matcher.ELF,
] ]
def detect(args): def detect(args):
@ -25,12 +30,22 @@ def detect(args):
print("detected", filetype.name) print("detected", filetype.name)
for m in filetype.matches: for m in filetype.matches:
print(">", m) print(">", m)
filetype.view(m, None)
return matches return matches
def extract(args): def extract(args):
pass file = Path(args.file)
folder = file.parent / (file.name + "_extracted")
folder.mkdir(exist_ok=True)
matches = detect(args)
for filetype in matches:
print("detected", filetype.name)
for m in filetype.matches:
print(">", m)
filetype.view(m, folder)
def main(): def main():
parser = argparse.ArgumentParser(description='Program for detecting or extracting data.') parser = argparse.ArgumentParser(description='Program for detecting or extracting data.')
@ -64,19 +79,8 @@ def main():
if args.command == 'detect': if args.command == 'detect':
detect(args) detect(args)
# if args.isa:
# # Perform ISA detection on the file
# print('Performing ISA detection on:', args.file)
# else:
# parser.print_help()
elif args.command == 'extract': elif args.command == 'extract':
extract(args) extract(args)
# if args.dry:
# # Perform a dry run without extracting
# print('Dry run extraction from:', args.file)
# else:
# # Extract data from the file
# print('Extracting data from:', args.file)
else: else:
parser.print_help() parser.print_help()

View File

@ -11,3 +11,8 @@ from .flatten_device_tree import FlattenDeviceTree
# file system formats # file system formats
from .squashfs import SquashFS from .squashfs import SquashFS
from .ubifs import UbiFS from .ubifs import UbiFS
from .romfs import RomFS
from .cromfs import CromFS
# common executable formats
from .elf import ELF

153
python/matcher/cromfs.py Normal file
View File

@ -0,0 +1,153 @@
import os
import io
from pathlib import Path
import lzf
from .matcher import SignatureMatcher, Match
class CromFS(SignatureMatcher):
    """Matcher/extractor for the NuttX CromFS read-only file system.

    Little-endian on-disk layout; reference generator:
    https://github.com/deadsy/nuttx/blob/master/tools/gencromfs.c
    (possibly more current: the gitea mirror of nuttx tools/gencromfs.c)

    NOTE: distinct from Bisqwit's CROMFS (big-endian, magic CROMFS02/03,
    https://bisqwit.iki.fi/src/cromfs-format.txt).
    """

    def __init__(self, file):
        self.name = "CromFS"
        self.signature = b'CROM'
        super().__init__(file)

    def is_valid(self):
        """Scan for CromFS volume headers and record plausible hits.

        Header layout (20 bytes, little-endian):
          magic[4] nnodes[2] nblocks[2] root[4] fsize[4] bsize[4]
        Returns True when at least one match was recorded.
        """
        as_num = lambda raw: int.from_bytes(raw, 'little')
        for start in self.search():
            header = io.BytesIO(self.file[start:start + 20])
            header.read(4)                   # magic, already matched by search()
            nnodes = as_num(header.read(2))  # total node count
            nblocks = as_num(header.read(2)) # total data-block count
            root = as_num(header.read(4))    # offset of the root node
            fsize = as_num(header.read(4))   # total volume size in bytes
            bsize = as_num(header.read(4))   # generator block size
            # gencromfs places the root node immediately after the 20-byte
            # header, so any other value is a false positive of the magic.
            if root != 20:
                continue
            data = {
                'nnodes': nnodes,
                'nblocks': nblocks,
                'root': root,
                'bsize': bsize,
            }
            self.matches += [Match(start, fsize, data)]
        return len(self.matches) != 0

    def view(self, match, root_folder):
        """Walk the node tree of one CromFS match.

        When root_folder is not None, the directory tree is recreated under
        root_folder/_<hex offset> and decompressed file contents are written;
        otherwise the walk is a dry run (nothing touches the disk).
        """
        as_num = lambda raw: int.from_bytes(raw, 'little')

        def extract(path, is_folder):
            # Materialise `path` on disk. Returns an open file object for
            # regular files, None for directories or when only viewing.
            if root_folder is None:
                return None
            p = root_folder / ('_' + hex(match.offset))
            p.mkdir(exist_ok=True)
            p = p / path
            if is_folder:
                p.mkdir(exist_ok=True)
                return None
            p.touch(exist_ok=True)
            return open(p, 'wb')

        region = io.BytesIO(self.file[match.offset : match.offset + match.length])
        region.seek(match.data['root'], os.SEEK_SET)

        def read_str_at(buffer, at=None, recover=False):
            # Read a NUL-terminated string at offset `at`; optionally restore
            # the previous stream position afterwards.
            if at == 0 or at is None:
                return ''
            s = b''
            old = buffer.tell()
            buffer.seek(at, os.SEEK_SET)
            while True:
                c = buffer.read(1)
                # b'' means EOF: stop instead of spinning forever on a
                # truncated/corrupt image.
                if c == b'\x00' or c == b'':
                    break
                s += c
            if recover:
                buffer.seek(old, os.SEEK_SET)
            return s.decode()

        def read_file_contents(buffer, at=None, recover=False, file=None, total_size=0):
            # Decode the chain of data blocks belonging to one regular file.
            # Each block: 2-byte magic, 1-byte type, then 16-bit big-endian
            # length field(s). Type 0 = stored, type 1 = LZF-compressed.
            old = buffer.tell()
            if at:
                buffer.seek(at, os.SEEK_SET)
            file_contents = b''
            while total_size > 0:
                buffer.read(2)  # per-block magic, not validated here
                typ = as_num(buffer.read(1))
                if typ == 0:
                    # stored block: length, then raw bytes
                    size = as_num(buffer.read(1)) << 8 | as_num(buffer.read(1))
                    file_contents += buffer.read(size)
                    total_size -= size
                elif typ == 1:
                    # compressed block: compressed length, uncompressed length
                    clen = as_num(buffer.read(1)) << 8 | as_num(buffer.read(1))
                    ulen = as_num(buffer.read(1)) << 8 | as_num(buffer.read(1))
                    compressed = buffer.read(clen)
                    file_contents += lzf.decompress(compressed, ulen)
                    total_size -= ulen
                else:
                    # Unknown block type (or we ran past the end of the
                    # region): bail out rather than loop forever, since
                    # total_size would never be decremented.
                    break
            if file:
                file.write(file_contents)
            if recover:
                buffer.seek(old, os.SEEK_SET)

        def read_nodes(buffer, current):
            # Parse one node record, then recurse into children and peers.
            mode = as_num(buffer.read(2))
            buffer.read(2)  # padding
            name_offset = as_num(buffer.read(4))
            size = as_num(buffer.read(4))
            peer = as_num(buffer.read(4))   # offset of next sibling, 0 = none
            extra = as_num(buffer.read(4))  # child offset (dir) / data offset (file)
            name = read_str_at(buffer, at=name_offset, recover=True)
            # NOTE(review): bitwise tests against S_IFMT-style bits; S_IFLNK
            # (0xA000) therefore also satisfies the reg test. Kept as in the
            # original -- confirm against gencromfs.c node encoding.
            is_dir = lambda m: m & (4 << 12) != 0
            is_reg = lambda m: m & (8 << 12) != 0
            is_link = lambda m: m & (10 << 12) != 0
            path = current / name
            if is_link(mode):
                pass  # symlinks are not extracted
            if is_dir(mode):
                extract(path, True)
                # traverse the directory children
                buffer.seek(extra, os.SEEK_SET)
                read_nodes(buffer, path)
            if is_reg(mode) and name != '.' and name != '..':
                f = extract(path, False)
                read_file_contents(buffer, at=extra, recover=True, total_size=size, file=f)
                if f:
                    f.close()
            # traverse its peer
            if peer != 0:
                buffer.seek(peer, os.SEEK_SET)
                read_nodes(buffer, current)

        read_nodes(region, Path(''))

17
python/matcher/elf.py Normal file
View File

@ -0,0 +1,17 @@
import io
from .matcher import SignatureMatcher, Match
class ELF(SignatureMatcher):
    """Signature matcher for 64-bit little-endian ELF binaries.

    Matches the full 16-byte e_ident prefix: \\x7fELF, ELFCLASS64,
    ELFDATA2LSB, EV_CURRENT, then zero padding.
    """

    def __init__(self, file):
        self.name = "ELF"
        self.signature = b'\x7f\x45\x4c\x46\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00'
        super().__init__(file)

    def is_valid(self):
        """Record every e_ident hit; the image size is unknown here, so 0."""
        for offset in self.search():
            # TODO(review): walk back / parse section headers to size the image
            self.matches.append(Match(offset, 0))
        return bool(self.matches)

View File

@ -1,7 +1,23 @@
import os
import io import io
from pyfdt.pyfdt import FdtBlobParse
from .matcher import SignatureMatcher, Match from .matcher import SignatureMatcher, Match
class FlattenDeviceTree(SignatureMatcher): class FlattenDeviceTree(SignatureMatcher):
"""
https://devicetree-specification.readthedocs.io/en/stable/flattened-format.html
header
-> space
-> memory reservation block
-> space
-> structure block
-> space
-> strings block
-> space
"""
def __init__(self, file): def __init__(self, file):
self.name = "Flatten Device Tree" self.name = "Flatten Device Tree"
self.signature = b'\xd0\x0d\xfe\xed' self.signature = b'\xd0\x0d\xfe\xed'
@ -9,6 +25,9 @@ class FlattenDeviceTree(SignatureMatcher):
def is_valid(self): def is_valid(self):
for match in self.search(): for match in self.search():
"""
All the header fields ... stored in big-endian format
"""
start = match start = match
header = io.BytesIO(self.file[start:start+4*10]) header = io.BytesIO(self.file[start:start+4*10])
magic = header.read(4) magic = header.read(4)
@ -22,7 +41,32 @@ class FlattenDeviceTree(SignatureMatcher):
size_dt_strings = header.read(4) size_dt_strings = header.read(4)
size_dt_struct = header.read(4) size_dt_struct = header.read(4)
totalsize = int.from_bytes(totalsize, 'little') as_num = lambda f: int.from_bytes(f, 'big')
self.matches += [Match(start, totalsize)] totalsize = as_num(totalsize)
data = {
'off_dt_struct': as_num(off_dt_struct),
'off_dt_strings': as_num(off_dt_strings),
'off_mem_rsvmap': as_num(off_mem_rsvmap),
'version': as_num(version),
'last_comp_version': as_num(last_comp_version),
'boot_cpuid_phys': as_num(boot_cpuid_phys),
'size_dt_strings': as_num(size_dt_strings),
'size_dt_struct': as_num(size_dt_struct),
}
self.matches += [Match(start, totalsize, data)]
return len(self.matches) != 0 return len(self.matches) != 0
def view(self, match, root_folder=None):
    """Parse the matched FDT blob and print nodes that look like kernel images.

    root_folder is accepted for interface compatibility with
    SignatureMatcher.view (callers pass an extraction folder as the second
    positional argument); FDT viewing currently only prints, nothing is
    written to disk.
    """
    # BUGFIX: without the root_folder parameter this override raised
    # TypeError whenever callers invoked filetype.view(m, folder).
    region = io.BytesIO(self.file[match.offset : match.offset + match.length])
    fdt = FdtBlobParse(region).to_fdt()
    for name, node in fdt.rootnode.walk():
        print(name)
        # FIT-style images describe kernels under nodes whose path mentions
        # "kernel"; surface the interesting properties.
        if "kernel" in name and any(k in name for k in ("arch", "os", "description")):
            print("> ", node)

View File

@ -26,3 +26,6 @@ class SignatureMatcher:
def is_valid(self): def is_valid(self):
return False return False
def view(self, match, root_folder=None):
    """Inspect one match; base implementation is a no-op.

    Subclasses override this to print details of `match` and, when
    root_folder (presumably a pathlib.Path -- callers in main.py pass
    `folder` built with Path) is given, extract contents into it.
    """
    pass

38
python/matcher/romfs.py Normal file
View File

@ -0,0 +1,38 @@
import io
from PyRomfsImage import *
from .matcher import SignatureMatcher, Match
class RomFS(SignatureMatcher):
    """
    RomFS, a read-only file system (big-endian).
    https://www.kernel.org/doc/Documentation/filesystems/romfs.txt
    """

    def __init__(self, file):
        self.name = "RomFS"
        self.signature = b'-rom1fs-'
        super().__init__(file)

    def is_valid(self):
        """Record every '-rom1fs-' hit together with its declared full size."""
        for offset in self.search():
            hdr = io.BytesIO(self.file[offset:offset + 14])
            hdr.read(8)              # magic, already matched by search()
            size_raw = hdr.read(4)   # full size of the volume, big-endian
            hdr.read(4)              # checksum (not verified here)
            self.matches.append(Match(offset, int.from_bytes(size_raw, 'big')))
        return bool(self.matches)

    def view(self, match, root_folder):
        """Print the root directory name and its direct children."""
        begin = match.offset
        size = match.length
        image = Romfs(io.BytesIO(self.file[begin:begin + size]))
        top = image.getRoot()
        print(top.name)
        for entry in top.children:
            print(entry.name)

View File

@ -29,9 +29,11 @@ class Zip(SignatureMatcher):
file_name_length = header.read(2) file_name_length = header.read(2)
extra_field_length = header.read(2) extra_field_length = header.read(2)
file_name_length = int.from_bytes(file_name_length, 'little') as_num = lambda x: int.from_bytes(x, 'little')
extra_field_length = int.from_bytes(extra_field_length, 'little')
compressed_size = int.from_bytes(compressed_size, 'little') file_name_length = as_num(file_name_length)
extra_field_length = as_num(extra_field_length)
compressed_size = as_num(compressed_size)
header_size = 4*4 + 2*7 header_size = 4*4 + 2*7
data = { data = {