"That's a huge improvement in generated code size. The above two compiles used the same gcc flags"<p>It would have been awfully nice to state the version of the compiler and the flags being used. With gcc 10 on Linux/AMD64, libc 4.15.0, this is what I get with '-Os':<p><pre><code> --8<--
# strlcpy as emitted by gcc -Os (AT&T syntax; register use follows the SysV AMD64 ABI).
# In:    %rdi = destination, %rsi = source, %rdx = destination size
#        (presumably char *dst, const char *src, size_t size; the C source is not shown)
# Out:   %rax = strlen(source), left untouched after the call
# Saved: %rbp and %rbx are pushed on entry and popped before ret
strlcpy:
.LFB5:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rdi, %rbp              # rbp = destination, must survive the call
        movq    %rsi, %rdi              # first argument of strlen = source
        pushq   %rbx
        .cfi_def_cfa_offset 24
        .cfi_offset 3, -24
        movq    %rdx, %rbx              # rbx = size, must survive the call
        subq    $24, %rsp               # spill slot; keeps rsp 16-byte aligned at the call
        .cfi_def_cfa_offset 48
        movq    %rsi, 8(%rsp)           # rsi is caller-saved: park source on the stack
        call    strlen                  # rax = length of source
        testq   %rbx, %rbx
        movq    8(%rsp), %rsi           # reload source (mov leaves the flags alone)
        je      .L1                     # size == 0: write nothing, just return the length
        leaq    -1(%rbx), %rdx          # rdx = size - 1, room left for the terminator
        movq    %rbp, %rdi              # copy target = destination
        cmpq    %rax, %rdx
        cmova   %rax, %rdx              # rdx = unsigned min(size - 1, length)
        movq    %rdx, %rcx              # byte count for rep movsb
        rep movsb                       # copy rcx bytes from (rsi) to (rdi)
        movb    $0, 0(%rbp,%rdx)        # destination[rdx] = 0, terminate the copy
.L1:
        addq    $24, %rsp
        .cfi_def_cfa_offset 24
        popq    %rbx
        .cfi_def_cfa_offset 16
        popq    %rbp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
-->8--
</code></pre>
Which doesn't seem so bad.