From 8be98b1acbed26cb76fc26572f72b733d1eee455 Mon Sep 17 00:00:00 2001 From: Scott Constable Date: Thu, 19 Oct 2023 23:38:23 -0700 Subject: [PATCH] Added a constant-time instruction decoder to supplement the default AEX-Notify mitigation. (#960) Added a constant-time instruction decoder to supplement the default AEX-Notify mitigation. Signed-off-by: Scott Constable --- sdk/simulation/trtssim/linux/Makefile | 11 +- sdk/trts/Makefile | 8 +- sdk/trts/ctd.c | 1347 +++++++++++++++++++++++++ sdk/trts/ctd.h | 59 ++ sdk/trts/linux/trts_mitigation.S | 4 +- sdk/trts/trts_veh.cpp | 47 +- 6 files changed, 1464 insertions(+), 12 deletions(-) create mode 100644 sdk/trts/ctd.c create mode 100644 sdk/trts/ctd.h diff --git a/sdk/simulation/trtssim/linux/Makefile b/sdk/simulation/trtssim/linux/Makefile index 0e8614402..7b23a807d 100644 --- a/sdk/simulation/trtssim/linux/Makefile +++ b/sdk/simulation/trtssim/linux/Makefile @@ -71,7 +71,7 @@ TRTS1_OBJS := init_enclave.o \ trts_emm_sim.o TRTS2_OBJS := trts_nsp.o -TRTS_OBJS := $(TRTS1_OBJS) $(TRTS2_OBJS) +TRTS_OBJS := $(TRTS1_OBJS) $(TRTS2_OBJS) ctd.o TINST_OBJS := t_instructions.o \ deriv.o @@ -93,6 +93,15 @@ TLDR_C_OBJS := elf_parser.o \ TLDR_OBJS := $(TLDR_ASM_OBJS) $(TLDR_C_OBJS) +ctd.o: $(TRTS_DIR)/ctd.c + $(CC) -mavx2 -O3 -masm=intel $(filter-out -O2,$(CFLAGS)) $(TCFLAGS) -c $< -o $@ \ + -I$(COMMON_DIR)/inc/ \ + -I$(COMMON_DIR)/inc/tlibc/ \ + -I$(COMMON_DIR)/inc/internal \ + -I$(LINUX_SDK_DIR)/trts/ \ + -I$(LINUX_SDK_DIR)/pthread/ \ + -I$(LINUX_SDK_DIR)/tlibcxx/include + LIBTRTS := libsgx_trts_sim.a vpath %.cpp $(TRTS_DIR):$(TINST_DIR) diff --git a/sdk/trts/Makefile b/sdk/trts/Makefile index d2478af3b..5750aeced 100644 --- a/sdk/trts/Makefile +++ b/sdk/trts/Makefile @@ -60,7 +60,9 @@ OBJS2 := trts_nsp.o OBJS3 := ema_rt.o -OBJS := $(OBJS1) $(OBJS2) $(OBJ3) +OBJS4 := ctd.o # The CTD requires AVX2 + +OBJS := $(OBJS1) $(OBJS2) $(OBJ3) $(OBJS4) all: $(OBJS) elf_parser @@ -74,6 +76,10 @@ $(OBJS2): %.o: %.cpp $(OBJS3): %.o: %.c $(CC) -c $(TCFLAGS) $(CFLAGS) -fPIC $< -o $@ +$(OBJS4): %.o: %.c + $(CC) -mavx2 -O3 -masm=intel $(filter-out -O2,$(CFLAGS)) $(TCFLAGS) \ + -I$(COMMON_DIR)/inc/internal -I$(COMMON_DIR)/inc -c $< -o $@ + .PHONY: elf_parser elf_parser: $(OBJS) $(MAKE) -C linux diff --git a/sdk/trts/ctd.c b/sdk/trts/ctd.c new file mode 100644 index 000000000..6244fe9f9 --- /dev/null +++ b/sdk/trts/ctd.c @@ -0,0 +1,1347 @@ +/* + * Copyright (C) 2011-2023 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/** + * File: ctd.c + * Description: + * Implements the constant-time instruction decoder for AEX-Notify + */ + +#include "ctd.h" + +#define NUM_MM_REGS 16 + +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef long long __m256i __attribute__((__vector_size__(32), __may_alias__)); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef long long __v4di __attribute__((__vector_size__(32))); +typedef char __v32qi __attribute__((__vector_size__(32))); +typedef unsigned long long __v4du __attribute__((__vector_size__(32))); +typedef int __v8si __attribute__((__vector_size__(32))); + +/** + * @brief Lookup table for all prefixes and opcode 1 + * Each entry is 1 byte, 8 bits: + * _ _ _ _ _ _ _ _ + * 1 2 3 4 5 6 7 8 + * a b c d + * Each 2 bits are for one unit, we call it a/b/c/d + * Switch (a) => + * case "0": means a standard prefix, there are 4 groups(refer to IA-32 manual Sec 2.1.1.1) + * then b is it's group number and cd together means its sequence in the group, starting 0 + * e.g. prefix 0x64 is the 5th prefix in group 2 => 0b00010100 0x14 + * case "1": means a rex prefix, other fields are unused + * case "2": means a standard opcode, b is set 1 for normal opcode + * c means accessing mod: + * switch (c) => + * case "0": no explict memory access + * case "1": memory read + * case "2": memory write, here we assume write includes read access + * case "3": reserved, illegal + * d means accessing size: in standard instruction, we only have: byte, word(2 bytes), dword/fword(double word, 4 bytes), qword(quadratic word, 8 bytes) + * switch (d) => + * case "0": byte + * case "1": word + * case "2": dword/fword + * case "3": qword + * b = 2: string related instructions: + * There are 5 sets of instructions(6c/6d/6e/6f/aa/ab/ac/ad/ae/af), both accessing either [RDI] or [RSI]. + * c: means the register to access: + * 0 => RSI + * 1 => RDI + * (in our ctx, rdi is later than rsi by 1, so we use this encoding and base offset to direct get value) + * d: means accessing mod: + * 0 => read + * 1 => write + * b = 3: stack instructions: call/push: + * all these instructions will access [RSP] with WRITE + * + * case "3": extended map for 1 opcode, refer to Section 2 A.4 + * b means different rows: + * switch (b) => + * case "0": row opcode 80 - 83, 8f + * case "1": row opcode c0/c1/d0/d1/d2/d3 + * case "2": row F6/F7 + * case "3": row fe/ff/c6/c7 + * c is either 01B or 10B, since the r/w will depend on one more table lookup + * d is for access size + * switch (d) => + * case "0": byte + * case "1": word + * case "2": dword/fword + * case "3": qword + * + * + * Notes: + * 1. The escape opcode 0xf is regraded as normal, no memory access + * 2. Unused parts are set to 0 + */ +static uint8_t standard[256] = { + 0x98, // 0x0 + 0x9a, // 0x1 + 0x94, // 0x2 + 0x96, // 0x3 + 0x90, // 0x4 + 0x90, // 0x5 + 0x90, // 0x6 + 0x90, // 0x7 + 0x98, // 0x8 + 0x9a, // 0x9 + 0x94, // 0xa + 0x96, // 0xb + 0x90, // 0xc + 0x90, // 0xd + 0x90, // 0xe + 0x80, // 0xf + 0x98, // 0x10 + 0x9a, // 0x11 + 0x94, // 0x12 + 0x96, // 0x13 + 0x90, // 0x14 + 0x90, // 0x15 + 0x90, // 0x16 + 0x90, // 0x17 + 0x98, // 0x18 + 0x9a, // 0x19 + 0x94, // 0x1a + 0x96, // 0x1b + 0x90, // 0x1c + 0x90, // 0x1d + 0x90, // 0x1e + 0x90, // 0x1f + 0x98, // 0x20 + 0x9a, // 0x21 + 0x94, // 0x22 + 0x96, // 0x23 + 0x90, // 0x24 + 0x90, // 0x25 + 0x13, // 0x26 + 0x90, // 0x27 + 0x98, // 0x28 + 0x9a, // 0x29 + 0x94, // 0x2a + 0x96, // 0x2b + 0x90, // 0x2c + 0x90, // 0x2d + 0x10, // 0x2e + 0x90, // 0x2f + 0x98, // 0x30 + 0x9a, // 0x31 + 0x94, // 0x32 + 0x96, // 0x33 + 0x90, // 0x34 + 0x90, // 0x35 + 0x11, // 0x36 + 0x90, // 0x37 + 0x94, // 0x38 + 0x96, // 0x39 + 0x94, // 0x3a + 0x96, // 0x3b + 0x90, // 0x3c + 0x90, // 0x3d + 0x12, // 0x3e + 0x90, // 0x3f + 0x40, // 0x40 + 0x40, // 0x41 + 0x40, // 0x42 + 0x40, // 0x43 + 0x40, // 0x44 + 0x40, // 0x45 + 0x40, // 0x46 + 0x40, // 0x47 + 0x40, // 0x48 + 0x40, // 0x49 + 0x40, // 0x4a + 0x40, // 0x4b + 0x40, // 0x4c + 0x40, // 0x4d + 0x40, // 0x4e + 0x40, // 0x4f + 0xb0, // 0x50 + 0xb0, // 0x51 + 0xb0, // 0x52 + 0xb0, // 0x53 + 0xb0, // 0x54 + 0xb0, // 0x55 + 0xb0, // 0x56 + 0xb0, // 0x57 + 0x90, // 0x58 + 0x90, // 0x59 + 0x90, // 0x5a + 0x90, // 0x5b + 0x90, // 0x5c + 0x90, // 0x5d + 0x90, // 0x5e + 0x90, // 0x5f + 0xb0, // 0x60 + 0x90, // 0x61 + 0x90, // 0x62 + 0x95, // 0x63 + 0x14, // 0x64 + 0x15, // 0x65 + 0x20, // 0x66 + 0x30, // 0x67 + 0xb0, // 0x68 + 0x96, // 0x69 + 0xb0, // 0x6a + 0x96, // 0x6b + 0xa5, // 0x6c + 0xa5, // 0x6d + 0xa0, // 0x6e + 0xa0, // 0x6f + 0x90, // 0x70 + 0x90, // 0x71 + 0x90, // 0x72 + 0x90, // 0x73 + 0x90, // 0x74 + 0x90, // 0x75 + 0x90, // 0x76 + 0x90, // 0x77 + 0x90, // 0x78 + 0x90, // 0x79 + 0x90, // 0x7a + 0x90, // 0x7b + 0x90, // 0x7c + 0x90, // 0x7d + 0x90, // 0x7e + 0x90, // 0x7f + 0xc4, // 0x80 + 0xc6, // 0x81 + 0xc4, // 0x82 + 0xc6, // 0x83 + 0x94, // 0x84 + 0x96, // 0x85 + 0x94, // 0x86 + 0x96, // 0x87 + 0x98, // 0x88 + 0x9a, // 0x89 + 0x94, // 0x8a + 0x96, // 0x8b + 0x99, // 0x8c + 0x90, // 0x8d + 0x95, // 0x8e + 0x9a, // 0x8f + 0x90, // 0x90 + 0x90, // 0x91 + 0x90, // 0x92 + 0x90, // 0x93 + 0x90, // 0x94 + 0x90, // 0x95 + 0x90, // 0x96 + 0x90, // 0x97 + 0x90, // 0x98 + 0x90, // 0x99 + 0xb0, // 0x9a + 0x90, // 0x9b + 0xb0, // 0x9c + 0x90, // 0x9d + 0x90, // 0x9e + 0x90, // 0x9f + 0x90, // 0xa0 + 0x90, // 0xa1 + 0x90, // 0xa2 + 0x90, // 0xa3 + 0xa5, // 0xa4 + 0xa5, // 0xa5 + 0xa0, // 0xa6 + 0xa0, // 0xa7 + 0x90, // 0xa8 + 0x90, // 0xa9 + 0xa5, // 0xaa + 0xa5, // 0xab + 0xa0, // 0xac + 0xa0, // 0xad + 0xa4, // 0xae + 0xa4, // 0xaf + 0x90, // 0xb0 + 0x90, // 0xb1 + 0x90, // 0xb2 + 0x90, // 0xb3 + 0x90, // 0xb4 + 0x90, // 0xb5 + 0x90, // 0xb6 + 0x90, // 0xb7 + 0x90, // 0xb8 + 0x90, // 0xb9 + 0x90, // 0xba + 0x90, // 0xbb + 0x90, // 0xbc + 0x90, // 0xbd + 0x90, // 0xbe + 0x90, // 0xbf + 0xd8, // 0xc0 + 0xda, // 0xc1 + 0x90, // 0xc2 + 0x90, // 0xc3 + 0x90, // 0xc4 + 0x90, // 0xc5 + 0xf8, // 0xc6 + 0xfa, // 0xc7 + 0x90, // 0xc8 + 0x90, // 0xc9 + 0x90, // 0xca + 0x90, // 0xcb + 0x90, // 0xcc + 0x90, // 0xcd + 0x90, // 0xce + 0x90, // 0xcf + 0xd8, // 0xd0 + 0xda, // 0xd1 + 0xd8, // 0xd2 + 0xda, // 0xd3 + 0x90, // 0xd4 + 0x90, // 0xd5 + 0x90, // 0xd6 + 0x90, // 0xd7 + 0x90, // 0xd8 + 0x90, // 0xd9 + 0x90, // 0xda + 0x90, // 0xdb + 0x90, // 0xdc + 0x90, // 0xdd + 0x90, // 0xde + 0x90, // 0xdf + 0x90, // 0xe0 + 0x90, // 0xe1 + 0x90, // 0xe2 + 0x90, // 0xe3 + 0x90, // 0xe4 + 0x90, // 0xe5 + 0x90, // 0xe6 + 0x90, // 0xe7 + 0xb0, // 0xe8 + 0x90, // 0xe9 + 0x90, // 0xea + 0x90, // 0xeb + 0x90, // 0xec + 0x90, // 0xed + 0x90, // 0xee + 0x90, // 0xef + 0x03, // 0xf0 + 0x90, // 0xf1 + 0x01, // 0xf2 + 0x02, // 0xf3 + 0x90, // 0xf4 + 0x90, // 0xf5 + 0xe4, // 0xf6 + 0xe6, // 0xf7 + 0x90, // 0xf8 + 0x90, // 0xf9 + 0x90, // 0xfa + 0x90, // 0xfb + 0x90, // 0xfc + 0x90, // 0xfd + 0xf8, // 0xfe + 0xf6, // 0xff +}; + +/** + * @brief Lookup table for opcode 2 + * this table is different from `standard` table, since we now know all the elements are opcodes, no more prefixes + * also the memory access size increased + * Each entry is 1 byte, 8 bits: + * _ _ _ _ _ _ _ _ + * 1 2 3 4 5 6 7 8 + * a b c + * We have a, b as similiar position as `standard` table, and the final c takes lower 4 bits + * a is demote whether this is a valid opcode or escape opcode(since we are not going to support 3 opcode instructions) + * switch (a): + * case "0": escape opcode, 0x38 or 0x3A, refer to IA-32 manual A.2.4.3 + * case "2": a valid opcode. b is used for memory access + * b has 3 states: no explicit access, read, write + * switch (b) => + * case "0": no memory access + * case "1": memory read + * case "2": memory write(here we assume write includes read permission) + * case "3": unused + * case "1": a valid normal opcode, but b needs further lookup for the normal table + * currently we have 0x7E, 0x1A, 0x1B for this case, because their mod will be changed by prefix, others no matter what prefix is, the mod is the same + * b will be a index meaning the b-th table to lookup in the 2 byte opcode normal lookup table + * b = switch (opcode 2) => { + * case "0x7e" => 0 + * case "0x1a" => 1 + * case "0x1b" => 2 + * } + * c will be the size as above + * case "3": a valid opcode with extension mode, refer toi Table A-3 in Vol 2 + * here we borrow the highest bit from c to b, so we will have b with 3bits and c with 3 bits + * b is used as index in the table to lookup + * switch (b) => + * case "0" => 0x00 + * case "1" => 0xba + * case "2" => 0xc7 + * case "3" => 0x01 + * case "4" => 0xae + * case "5" => 0x18 + * + * Note: + * 1. Unused parts are set to 0 + * + */ +static uint8_t extended[256] = { + 0xc1, // 0x0 __ + 0xd8, // 0x1 + 0x90, // 0x2 + 0x90, // 0x3 + 0x80, // 0x4 + 0x80, // 0x5 + 0x80, // 0x6 + 0x80, // 0x7 + 0x80, // 0x8 + 0x80, // 0x9 + 0x80, // 0xa + 0x80, // 0xb + 0x80, // 0xc + 0x90, // 0xd + 0x80, // 0xe + 0x93, // 0xf + 0x94, // 0x10 + 0xa4, // 0x11 + 0x93, // 0x12 + 0xa3, // 0x13 + 0x94, // 0x14 + 0x94, // 0x15 + 0x93, // 0x16 + 0xa3, // 0x17 + 0xe8, // 0x18 + 0x82, // 0x19 + 0x54, // 0x1a + 0x64, // 0x1b + 0x82, // 0x1c + 0x82, // 0x1d + 0x82, // 0x1e + 0x82, // 0x1f + 0x80, // 0x20 + 0x80, // 0x21 + 0x80, // 0x22 + 0x80, // 0x23 + 0x80, // 0x24 + 0x80, // 0x25 + 0x80, // 0x26 + 0x80, // 0x27 + 0x94, // 0x28 + 0xa4, // 0x29 + 0x93, // 0x2a + 0xa4, // 0x2b + 0x93, // 0x2c + 0x93, // 0x2d + 0x92, // 0x2e + 0x92, // 0x2f + 0x80, // 0x30 + 0x80, // 0x31 + 0x80, // 0x32 + 0x80, // 0x33 + 0x80, // 0x34 + 0x80, // 0x35 + 0x80, // 0x36 + 0x80, // 0x37 + 0x80, // 0x38 + 0x80, // 0x39 + 0x80, // 0x3a + 0x80, // 0x3b + 0x80, // 0x3c + 0x80, // 0x3d + 0x80, // 0x3e + 0x80, // 0x3f + 0x92, // 0x40 + 0x92, // 0x41 + 0x92, // 0x42 + 0x92, // 0x43 + 0x92, // 0x44 + 0x92, // 0x45 + 0x92, // 0x46 + 0x92, // 0x47 + 0x92, // 0x48 + 0x92, // 0x49 + 0x92, // 0x4a + 0x92, // 0x4b + 0x92, // 0x4c + 0x92, // 0x4d + 0x92, // 0x4e + 0x92, // 0x4f + 0x80, // 0x50 + 0x94, // 0x51 + 0x94, // 0x52 + 0x94, // 0x53 + 0x94, // 0x54 + 0x94, // 0x55 + 0x94, // 0x56 + 0x94, // 0x57 + 0x94, // 0x58 + 0x94, // 0x59 + 0x93, // 0x5a + 0x94, // 0x5b + 0x94, // 0x5c + 0x94, // 0x5d + 0x94, // 0x5e + 0x94, // 0x5f + 0x92, // 0x60 + 0x92, // 0x61 + 0x92, // 0x62 + 0x93, // 0x63 + 0x93, // 0x64 + 0x93, // 0x65 + 0x93, // 0x66 + 0x93, // 0x67 + 0x93, // 0x68 + 0x93, // 0x69 + 0x93, // 0x6a + 0x93, // 0x6b + 0x94, // 0x6c + 0x94, // 0x6d + 0x92, // 0x6e + 0x93, // 0x6f + 0x94, // 0x70 + 0x80, // 0x71 + 0x80, // 0x72 + 0x80, // 0x73 + 0x93, // 0x74 + 0x93, // 0x75 + 0x93, // 0x76 + 0x80, // 0x77 + 0x80, // 0x78 + 0x80, // 0x79 + 0x40, // 0x7a + 0x80, // 0x7b + 0x94, // 0x7c + 0x94, // 0x7d + 0x42, // 0x7e + 0xa3, // 0x7f + 0x80, // 0x80 + 0x80, // 0x81 + 0x80, // 0x82 + 0x80, // 0x83 + 0x80, // 0x84 + 0x80, // 0x85 + 0x80, // 0x86 + 0x80, // 0x87 + 0x80, // 0x88 + 0x80, // 0x89 + 0x80, // 0x8a + 0x80, // 0x8b + 0x80, // 0x8c + 0x80, // 0x8d + 0x80, // 0x8e + 0x80, // 0x8f + 0xa0, // 0x90 + 0xa0, // 0x91 + 0xa0, // 0x92 + 0xa0, // 0x93 + 0xa0, // 0x94 + 0xa0, // 0x95 + 0xa0, // 0x96 + 0xa0, // 0x97 + 0xa0, // 0x98 + 0xa0, // 0x99 + 0xa0, // 0x9a + 0xa0, // 0x9b + 0xa0, // 0x9c + 0xa0, // 0x9d + 0xa0, // 0x9e + 0xa0, // 0x9f + 0x80, // 0xa0 + 0x80, // 0xa1 + 0x80, // 0xa2 + 0x92, // 0xa3 + 0xa2, // 0xa4 + 0xa2, // 0xa5 + 0x80, // 0xa6 + 0x80, // 0xa7 + 0x80, // 0xa8 + 0x80, // 0xa9 + 0x80, // 0xaa + 0xa2, // 0xab + 0xa2, // 0xac + 0xa2, // 0xad + 0xe0, // 0xae + 0x92, // 0xaf + 0xa0, // 0xb0 + 0xa2, // 0xb1 + 0x92, // 0xb2 + 0xa2, // 0xb3 + 0x92, // 0xb4 + 0x92, // 0xb5 + 0x90, // 0xb6 + 0x91, // 0xb7 + 0x92, // 0xb8 + 0x80, // 0xb9 + 0xca, // 0xba + 0xa2, // 0xbb + 0x92, // 0xbc + 0x92, // 0xbd + 0x90, // 0xbe + 0x91, // 0xbf + 0xa0, // 0xc0 + 0xa2, // 0xc1 + 0x94, // 0xc2 + 0xa3, // 0xc3 + 0x91, // 0xc4 + 0x80, // 0xc5 + 0x94, // 0xc6 + 0xd3, // 0xc7 + 0x80, // 0xc8 + 0x80, // 0xc9 + 0x80, // 0xca + 0x80, // 0xcb + 0x80, // 0xcc + 0x80, // 0xcd + 0x80, // 0xce + 0x80, // 0xcf + 0x94, // 0xd0 + 0x93, // 0xd1 + 0x93, // 0xd2 + 0x93, // 0xd3 + 0x93, // 0xd4 + 0x93, // 0xd5 + 0xa3, // 0xd6 + 0x80, // 0xd7 + 0x93, // 0xd8 + 0x93, // 0xd9 + 0x93, // 0xda + 0x93, // 0xdb + 0x93, // 0xdc + 0x93, // 0xdd + 0x93, // 0xde + 0x93, // 0xdf + 0x93, // 0xe0 + 0x93, // 0xe1 + 0x93, // 0xe2 + 0x93, // 0xe3 + 0x93, // 0xe4 + 0x93, // 0xe5 + 0x94, // 0xe6 + 0xa3, // 0xe7 + 0x93, // 0xe8 + 0x93, // 0xe9 + 0x93, // 0xea + 0x93, // 0xeb + 0x93, // 0xec + 0x93, // 0xed + 0x93, // 0xee + 0x93, // 0xef + 0x94, // 0xf0 + 0x93, // 0xf1 + 0x93, // 0xf2 + 0x93, // 0xf3 + 0x93, // 0xf4 + 0x93, // 0xf5 + 0x93, // 0xf6 + 0x80, // 0xf7 + 0x93, // 0xf8 + 0x93, // 0xf9 + 0x93, // 0xfa + 0x93, // 0xfb + 0x93, // 0xfc + 0x93, // 0xfd + 0x93, // 0xfe + 0x80, // 0xff +}; + +static uint64_t extension_1op_table = 0x155a55a5aaaa6aaa; + +static inline uint32_t opcode1byte_extension_lookup(uint8_t tblp, uint8_t moderm_reg) +{ + uint32_t tblp_t = tblp; + uint32_t moderm_reg_t = moderm_reg; + tblp_t = (tblp_t >> 4) & 0x3; + uint32_t shift = tblp_t * 16; + shift += moderm_reg_t * 2; + uint64_t res = extension_1op_table >> shift; + return res & 0x3; +} + +static uint32_t normal_2op_table = 0xa051a; + +static inline uint32_t opcode2byte_normal_lookup(uint8_t tblp_t, uint8_t moderm_reg_t) +{ + uint32_t tblp = tblp_t; + uint32_t moderm_reg = moderm_reg_t; + uint32_t shift = tblp * 8; + shift += moderm_reg * 2; + uint32_t res = normal_2op_table >> shift; + return res & 0x3; +} + +static uint8_t extension_2op_table[16] = { + 0x5a, + 0x05, + 0x00, + 0xa9, + 0x08, + 0x00, + 0x5a, + 0x50, + 0x96, + 0x40, + 0x55, + 0x00, + 0, + 0, + 0, + 0}; + +#define _mm_set1_epi32(val) (__extension__(__m128i)(__v4si){val, val, val, val}) + +#define _mm_set_epi64x(val1, val2) (__extension__(__m128i)(__v2di){val2, val1}) + +#define _mm_cmpeq_epi8(val1, val2) ((__m128i)((__v16qi)val1 == (__v16qi)val2)) + +#define _mm_set1_epi64x(val1) (__extension__(__m128i)(__v2di){val1, val1}) + +#define _mm_set1_epi8(val1) (__extension__(__m128i)(__v16qi){ \ + val1, val1, val1, val1, val1, val1, val1, val1, \ + val1, val1, val1, val1, val1, val1, val1, val1}) + +#define _mm_and_si128(val1, val2) ((__m128i)((__v2du)val1 & (__v2du)val2)) + +#define _mm_or_si128(val1, val2) ((__m128i)((__v2du)val1 | (__v2du)val2)) + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadu_si128 (__m128i_u const *__P) +{ + return *__P; +} + +#define _mm256_set1_epi64x(val1) (__extension__(__m256i)(__v4di){val1, val1, val1, val1}) + +#define _mm256_set1_epi8(val1) (__extension__(__m256i)(__v32qi){ \ + val1, val1, val1, val1, val1, val1, val1, val1, \ + val1, val1, val1, val1, val1, val1, val1, val1, \ + val1, val1, val1, val1, val1, val1, val1, val1, \ + val1, val1, val1, val1, val1, val1, val1, val1}) + +#define _mm256_and_si256(val1, val2) ((__m256i)((__v4du)val1 & (__v4du)val2)) + +#define _mm256_or_si256(val1, val2) ((__m256i)((__v4du)val1 | (__v4du)val2)) + +#define _mm256_cmpeq_epi8(val1, val2) ((__m256i)((__v32qi)val1 == (__v32qi)val2)) + +static inline uint32_t opcode2byte_extension_lookup(uint8_t tblp_t, uint8_t moderm_reg_t) +{ + uint32_t tblp = tblp_t; + uint32_t moderm_reg = moderm_reg_t; + __m128i num = (__m128i)__builtin_ia32_lddqu((char const *)extension_2op_table); + uint32_t idx = ((tblp * 2)) | ((tblp * 2 + 1) << 8); + __m128i res = (__m128i)__builtin_ia32_pshufb128((__v16qi)num, (__v16qi)_mm_set1_epi32((int)idx)); + uint32_t ans = (uint32_t)res[0]; + ans = ans >> moderm_reg * 2; + return ans & 0x3; +} + +/** + * @brief cselect for uint32_t types + * must be manually inspected to ensure the compiler generates CMOVcc family instructions + * + * @param pred + * @param old_val + * @param new_val + * @return uint32_t + */ +static inline uint32_t cselect(uint64_t pred, uint32_t old_val, uint32_t new_val) +{ + __asm__("cmp %1, 0\n\t" + "cmovne %0, %2" + : "+rm"(new_val) + : "rm"(pred), "rm"(old_val)); + return new_val; +} + +static inline int32_t cselect32(uint64_t pred, int32_t old_val, int32_t new_val) +{ + __asm__("cmp %1, 0\n\t" + "cmovne %0, %2" + : "+rm"(new_val) + : "rm"(pred), "rm"(old_val)); + return new_val; +} +static inline uint64_t cselect64(uint64_t pred, const uint64_t expected, uint64_t old_val, uint64_t new_val) __attribute__((always_inline)); +/** + * @brief cselect for uint64_t types + * must be manually inspected to ensure the compiler generates CMOVcc family instructions + * + * @param pred + * @param old_val + * @param new_val + * @return uint64 + */ +static inline uint64_t cselect64(uint64_t pred, const uint64_t expected, uint64_t old_val, uint64_t new_val) +{ + __asm__("cmp %1, %3\n\t" + "cmove %0, %2" + : "+rm"(new_val) + : "rm"(pred), "rm"(old_val), "irm"(expected)); + return new_val; +} + +static inline int64_t cselect64s(uint64_t pred, int64_t old_val, int64_t new_val) +{ + __asm__("cmp %1, 0\n\t" + "cmovne %0, %2" + : "+rm"(new_val) + : "rm"(pred), "rm"(old_val)); + return new_val; +} + +/** + * @brief load a value as uint64_t from given address + * + * @param ptr + * @return uint64 + */ +static inline uint64_t load_u64(__m128i instr_data) +{ + return (uint64_t)instr_data[0]; +} + +/** + * @brief load a value as uint8_t from given address with idx(secret) + * Normally it will leak the idx if we load the exact result as uint8_t + * So we read the whole uint64_t and use safe shift ops to hide the idx + * + * @param ptr + * @param idx secret, ensured won't out of bounds + * @return uint8_t + */ +static inline uint8_t load_u8_idx(uint8_t *ptr, uint8_t idx) +{ + return (uint8_t)((*((uint64_t *)ptr)) >> 8 * (idx)) & 0xff; +} + +/** + * @brief load an uint32_t from a given address and idx + * Noted the range of idx is [0, 15], so the uint32_t may not be aligned + * Also we need to hide the idx as secret, and extract/shift for __m128 requires an IMM8, not a register value + * So we have to use shuffle here + * Can also try the commented code above, which uses pure uint64_t to do the same thing + * + * @param ptr + * @param idx secret + * @return uint32 + */ +static inline uint32_t load_mm_u32(__m128i instr_data, uint8_t idx_t) +{ + uint64_t idx = idx_t; + __m128i num = instr_data; + uint64_t query = 0x8080808000000000 | (idx) | ((idx + 1) << 8) | (idx + 2) << 16 | (idx + 3) << 24; + __m128i query128 = _mm_set_epi64x((long long)0x8080808080808080, (long long)query); + __m128i res = (__m128i)__builtin_ia32_pshufb128((__v16qi)num, (__v16qi)query128); + return (uint32_t)res[0]; +} + +/** + * @brief load an uint8_t from a given address and idx + * Similar as load_u32, but much easier + * @param ptr + * @param idx secret + * @return uint8_t + */ +static inline uint8_t load_mm_u8(__m128i instr_data, uint8_t idx) +{ + __m128i num = instr_data; + uint64_t query = 0x8080808080808000 | idx; + __m128i query128 = _mm_set_epi64x((long long)0x8080808080808080, (long long)query); + __m128i res = (__m128i)__builtin_ia32_pshufb128((__v16qi)num, (__v16qi)query128); + return (uint8_t)res[0]; +} + +/** + * @brief Store an uint32_t into a given idx + * normally this method will leak the idx, but here we use a fixed index in the output array, so no information leaked + * + * @param ptr + * @param idx + * @param val + */ +static inline void store_u32(uint8_t *ptr, uint8_t idx, uint32_t val) +{ + uint32_t *pptr = (uint32_t *)(ptr + idx); + *pptr = val; +} + +/** + * @brief Check wether a given prefix pattern exists in the given data + * refer to `standard` table encoding for pattern info + * Many prefix are like yes or not booleans, so we only needs whether it exists or not + * + * @param data + * @param target_prefix + * @return uint32 + */ +static inline uint32_t check_prefix(uint64_t data, uint64_t target_prefix) +{ + uint64_t res = (uint64_t)_mm_cmpeq_epi8(_mm_set_epi64x(~0, (long long)data), _mm_set_epi64x(0, (long long)target_prefix))[0]; + uint32_t ans = cselect(res == 0, 0, 1); + return ans; +} + +/** + * @brief lookup prefix & opcode 1 in one batch + * + * Each byte is corresponding to an element in the `standard` table + * We use high 4 bits to serach for blocks, each block has 16 elements, so the lower 4 bits are indexs to search for the exact element in one block + * + * @param prs the lower 6 bytes are valid(at most 5 bytes prefix + 1 byte opcode) + * @return uint64_t The lower 6 bytes are valid, others should be 0 + */ +static inline uint64_t prefix_op1_lookup(uint64_t prs) +{ + __m128i query = _mm_set1_epi64x((long long)prs); + __m128i *table = (__m128i *)standard; + uint64_t ans = 0; + __m128i mask = _mm_set1_epi8((char)0xf0); + __m128i op1_h4b = _mm_and_si128(mask, query); + __m128i op1_l4b = (__m128i)__builtin_ia32_pandn128((__v2di)mask, (__v2di)query); + for (char i = 0; i < 16; i++) + { + __m128i rmask = _mm_cmpeq_epi8(op1_h4b, _mm_set1_epi8((char)(i << 4))); + // here another way is to use _mm_andnot_si128, I tried on test server, doesn't have much differences + rmask = (__m128i)__builtin_ia32_pandn128((__v2di)rmask, (__v2di)_mm_set1_epi8((char)0x80)); + rmask = _mm_or_si128(rmask, op1_l4b); + __m128i tb = _mm_loadu_si128(table + i); + __m128i res = (__m128i)__builtin_ia32_pshufb128((__v16qi)tb, (__v16qi)rmask); + ans |= (uint64_t)res[0]; + } + return ans & 0x0000ffffffffffff; +} + +/** + * @brief lookup opcode 2 in one batch + * + * Each byte is corresponding to an element in the `extended` table + * We use high 4 bits to serach for blocks, each block has 16 elements, so the lower 4 bits are indexs to search for the exact element in one block + * + * @param op2 the lower 1 byte is valid(at most 1 byte opcode 2) + * @return uint8_t Table look up result + */ +static inline uint8_t op2_lookup(uint8_t op2) +{ + __m128i *table = (__m128i *)extended; + uint64_t ans = 0; + for (uint64_t i = 0; i < 16; i++) + { + // since we only need to lookup 1 byte, use this(uint8_t) have no instruction number diff in binary + uint8_t rmask = (uint8_t)cselect((op2 >> 4) == i, 0x00, 0x80); + __m128i tb = _mm_loadu_si128(table + i); + __m128i res = (__m128i)__builtin_ia32_pshufb128((__v16qi)tb, (__v16qi)_mm_set1_epi8((char)(rmask | (op2 & 0xf)))); + ans |= (uint64_t)res[0]; + } + return (uint8_t)ans; +} + +static inline __m128i prefix_op1_op2_lookup(uint64_t op1) +{ + __m256i query = _mm256_set1_epi64x((long long)op1); + __m128i *table1 = (__m128i *)standard; + __m128i *table2 = (__m128i *)extended; + __m256i mask = _mm256_set1_epi8((char)0xf0); + __m256i op1_h4b = _mm256_and_si256(mask, query); + __m256i op1_l4b = (__m256i)__builtin_ia32_andnotsi256((__v4di)mask, (__v4di)query); + uint64_t ans1 = 0; + uint64_t ans2 = 0; + for (uint64_t i = 0; i < 16; i++) + { + __m256i rmask = _mm256_cmpeq_epi8(op1_h4b, _mm256_set1_epi8((char)(i << 4))); + // here another way is to use _mm_andnot_si128, I tried on test server, doesn't have much differences + rmask = (__m256i)__builtin_ia32_andnotsi256(rmask, _mm256_set1_epi8((char)0x80)); + rmask = _mm256_or_si256(rmask, op1_l4b); + __m128i tb1 = _mm_loadu_si128(table1 + i); + __m128i tb2 = _mm_loadu_si128(table2 + i); + __m256i tb12 = (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__builtin_ia32_si256_si((__v4si)tb1), (__v4si)tb2, 1); + __m256i res = (__m256i)__builtin_ia32_pshufb256((__v32qi)tb12, (__v32qi)rmask); + ans1 |= (uint64_t)res[0]; + ans2 |= (uint64_t)res[2]; + } + return _mm_set_epi64x((long long)ans2, ans1 & 0x0000ffffffffffff); +} + +/** + * @brief Constant-time read the register value while hiding the register we actually read + * Since AVX2 doesn't support much for uint64, we need to manually do it + * + * @param ctx + * @param idx + * @return uint64_t + */ +static uint64_t register_value_select(sgx_cpu_context_t *ctx, uint32_t idx) +{ + uint64_t ans = 0; + ans = cselect64(idx, 0x0, ctx->rax, ans); + ans = cselect64(idx, 0x1, ctx->rcx, ans); + ans = cselect64(idx, 0x2, ctx->rdx, ans); + ans = cselect64(idx, 0x3, ctx->rbx, ans); + ans = cselect64(idx, 0x4, ctx->rsp, ans); + ans = cselect64(idx, 0x5, ctx->rbp, ans); + ans = cselect64(idx, 0x6, ctx->rsi, ans); + ans = cselect64(idx, 0x7, ctx->rdi, ans); + ans = cselect64(idx, 0x8, ctx->r8, ans); + ans = cselect64(idx, 0x9, ctx->r9, ans); + ans = cselect64(idx, 0xa, ctx->r10, ans); + ans = cselect64(idx, 0xb, ctx->r11, ans); + ans = cselect64(idx, 0xc, ctx->r12, ans); + ans = cselect64(idx, 0xd, ctx->r13, ans); + ans = cselect64(idx, 0xe, ctx->r14, ans); + ans = cselect64(idx, 0xf, ctx->r15, ans); + return ans; +} + +/** + * Always load 2 pages data and combine together + * @param ptr + * @return + */ +static inline __m128i load_cross_page(uint8_t* ptr){ + uint8_t *rip_pfn = (uint8_t *) (((uint64_t) ptr) & (uint64_t)(~0xFFF)); + uint8_t *rip_idx = (uint8_t *) (((uint64_t)ptr) & 0xFFF); + + // the data is the original data 8 bytes from the target memory + // first ensure it doesn't cross a page boundary + int64_t rip_idx_masked = cselect64s((uint64_t) rip_idx <= (4096 - 16), (int64_t) rip_idx, 4096 - 16); + __m128i_u * loc = (__m128i_u *) ((uint64_t) rip_pfn | (uint64_t)rip_idx_masked); + __m128i data = _mm_loadu_si128(loc); + + // finally combine both as needed + uint32_t shift_amount = cselect((uint64_t) rip_idx > (4096 - 16), 16 - (uint32_t)(4096 - (uint64_t)rip_idx), 0); + //we need to make a _m256i first then do the shift + //suppose if it across the page, we just pad with 0x00 + data = data << shift_amount; + return data; +} + +/** + * @brief our function to disassemble one instruction from the given input + * One x64 instruction has at most 15 bytes, and our current output takes 12 bytes + * + */ +int ct_decode(sgx_cpu_context_t *ctx, uint64_t *addr) +{ + //save & protect the status of ymm/zmm registers + __m512i mm_regs[NUM_MM_REGS]; + + if (g_cpu_feature_indicator & CPU_FEATURE_AVX512F) + { + // Restore ZMM0-ZMM15 + asm ( // save caller state + "vmovaps [%0+0*64], zmm0\n\t" + "vmovaps [%0+1*64], zmm1\n\t" + "vmovaps [%0+2*64], zmm2\n\t" + "vmovaps [%0+3*64], zmm3\n\t" + "vmovaps [%0+4*64], zmm4\n\t" + "vmovaps [%0+5*64], zmm5\n\t" + "vmovaps [%0+6*64], zmm6\n\t" + "vmovaps [%0+7*64], zmm7\n\t" + "vmovaps [%0+8*64], zmm8\n\t" + "vmovaps [%0+9*64], zmm9\n\t" + "vmovaps [%0+10*64], zmm10\n\t" + "vmovaps [%0+11*64], zmm11\n\t" + "vmovaps [%0+12*64], zmm12\n\t" + "vmovaps [%0+13*64], zmm13\n\t" + "vmovaps [%0+14*64], zmm14\n\t" + "vmovaps [%0+15*64], zmm15\n\t" + : : "r" (mm_regs) : "memory" + ); + } else { + // Restore YMM0-YMM15 + asm ( // save caller state + "vmovaps [%0+0*32], ymm0\n\t" + "vmovaps [%0+1*32], ymm1\n\t" + "vmovaps [%0+2*32], ymm2\n\t" + "vmovaps [%0+3*32], ymm3\n\t" + "vmovaps [%0+4*32], ymm4\n\t" + "vmovaps [%0+5*32], ymm5\n\t" + "vmovaps [%0+6*32], ymm6\n\t" + "vmovaps [%0+7*32], ymm7\n\t" + "vmovaps [%0+8*32], ymm8\n\t" + "vmovaps [%0+9*32], ymm9\n\t" + "vmovaps [%0+10*32], ymm10\n\t" + "vmovaps [%0+11*32], ymm11\n\t" + "vmovaps [%0+12*32], ymm12\n\t" + "vmovaps [%0+13*32], ymm13\n\t" + "vmovaps [%0+14*32], ymm14\n\t" + "vmovaps [%0+15*32], ymm15\n\t" + : : "r" (mm_regs) : "memory" + ); + } + // properly handle the cross page read + uint8_t *input = (uint8_t *)(ctx->rip); + uint8_t *rip_pfn = (uint8_t *) (((uint64_t) input) & (uint64_t)(~0xFFF)); + uint8_t *rip_idx = (uint8_t *) (((uint64_t)input) & 0xFFF); + + // the data is the original data 8 bytes from the target memory + // first ensure it doesn't cross a page boundary + int64_t rip_idx_masked = cselect64s((uint64_t) rip_idx <= (4096 - 16), (int64_t) rip_idx, 4096 - 16); + __m128i_u* loc = (__m128i_u *) ((uint64_t) rip_pfn | (uint64_t)rip_idx_masked); + __m128i instr_data = _mm_loadu_si128(loc); + + // finally combine both as needed + // this is also the K bytes that we failed to protect + uint32_t shift_amount = cselect((uint64_t) rip_idx > (4096 - 16), 16 - (uint32_t)(4096 - (uint64_t)rip_idx), 0); + //we need to make a _m256i first then do the shift + //suppose if it across the page, we just pad with 0x00 + instr_data = instr_data << shift_amount; + + + // the data is the original data 8 bytes from the target memory + uint64_t data = load_u64(instr_data); + // we do the table look up for its first 6 bytes + __m128i op1op2 = prefix_op1_op2_lookup(data); + uint64_t lktb = (uint64_t)op1op2[0]; + uint32_t rmmod = 0; + uint32_t size = 0; + uint32_t rex = 0; + uint32_t extended_opcode = 0; + uint32_t idx = 15; + uint32_t lock_prefix = 0; + uint32_t string_instruction = 0; + uint32_t stack_instruction = 0; + uint32_t stack_instruction_ext = 0; + uint32_t operand_rewrite = 0; + uint32_t addr_rewrite = 0; + uint32_t effective_prefix_extension = 0; + // We need to know the exact idx of our opcode 1 + for (uint32_t i = 0; i < 6; i++) + { + uint64_t shift = (lktb >> (i * 8)); + // to avoid short-circuit evaluation, we need to combine everything into a number to cheat the compiler + uint32_t fg2 = (shift & 0xc0) >= 0x80; + uint32_t fg1 = (idx << 8) | fg2; + idx = cselect(fg1 == 0xf01, i, idx); + } + + // we move everything outside our loop as much as possible, then this can reduce the total number of instructions + // `tblp` is the table lookup result of `op1` + uint8_t tblp = (lktb >> (8 * idx)) & 0xff; + uint8_t op1 = (uint8_t)(data >> (8 * idx)); + // rmmod is: 0 => no memory access 1 => memory read 2 => memory write + rmmod = (tblp >> 2) & 0x3; + // size is the memory access size: byte, word, dword, qward, ... + size = tblp & 0x3; + // check if it's escape opcode, use this as mask for op2 + extended_opcode = cselect(op1 == 0xf, 1, 0); + string_instruction = cselect((tblp & 0xf0) == 0xa0, 1, 0); + stack_instruction = cselect((tblp & 0xf0) == 0xb0, 1, 0); + + // now we can handle the prefixes with the help of idx + uint64_t prefixes = lktb & (uint64_t)(0x000000ffffffffff >> ((5 - idx) * 8)); + // check prefixes, rex is special because if covers 0x40 - 0x4f + rex = check_prefix(prefixes >> ((idx - 1) * 8), 0x4040404040404040); + rex = cselect(rex, (data >> ((idx - 1) * 8)) & 0x4f, 0); + // operand size rewrite + operand_rewrite = check_prefix(prefixes, 0x2020202020202020); + effective_prefix_extension = cselect(operand_rewrite, 1, effective_prefix_extension); + addr_rewrite = check_prefix(prefixes, 0x3030303030303030); + lock_prefix = check_prefix(prefixes, 0x0303030303030303); + // f2 prefix check + effective_prefix_extension = cselect(check_prefix(prefixes, 0x0101010101010101), 3, effective_prefix_extension); + // f3 prefix check + effective_prefix_extension = cselect(check_prefix(prefixes, 0x0202020202020202), 2, effective_prefix_extension); + + // now we need to do an op2 table look up + uint8_t op2tb = (uint8_t)(op1op2[1] >> (idx + 1) * 8); + // rm mod and memory access size + uint32_t op2_rm = (op2tb >> 4) & 0x3; + uint32_t op2_size = op2tb & 0x07; + // use mask result to decide whether save these result + rmmod = cselect(extended_opcode, op2_rm, rmmod); + size = cselect(extended_opcode, op2_size, size); + // we also need to update idx + idx += extended_opcode; + uint32_t idx_tail = 1; + // we shift the idx to the modrm byte location(suppose it has) + uint8_t modRM = load_mm_u8(instr_data, (uint8_t)(idx + 1)); + //idx_tail += cselect(rmmod == 0, 0, 1); + // now we need to decompose mod rm + uint8_t modRM_mod = (modRM >> 6) & (3); + // for reg, in opcode extension, they works as identifier + uint8_t modRM_reg = (modRM >> 3) & (7); + + uint8_t modRM_rm = modRM & (7); + // create flags and save mod_rm info + uint8_t mod_rm_mod = (uint8_t)((modRM_mod << 6) | modRM_rm); + // one special case for modRM, is when mod isn't 3 and rm is 4, we need to revert rex bit, otherwise we need to add rex bit into modRM + uint32_t big_fg = cselect(modRM_mod != 3, 1, 0); + + // below is the case we handle opcode extension for 1 byte + rmmod = cselect((tblp >> 6) == 0x3, opcode1byte_extension_lookup(tblp, modRM_reg), rmmod); + // call/push instructions under opcode 1 byte extension table + uint32_t potential_stack_instruction = cselect(((modRM_reg == 2) | (modRM_reg == 3) | (modRM_reg == 6)) > 0, 1, 0); + stack_instruction_ext = cselect(( (uint32_t)(op1<<4) | (uint32_t)((tblp >> 6) + potential_stack_instruction) ) == 0xff4, 1, 0); + + // below is the case we handle for normal unusual 2 byte opcode 0x7e, 0x1a, 0x1b + uint32_t tmp_rm = opcode2byte_normal_lookup((uint8_t)op2_rm, (uint8_t)effective_prefix_extension); + // the condtion to apply this is: we are in 2 byte opcode mode and the opcode is valid + rmmod = cselect(((extended_opcode << 4) | (op2tb >> 6)) == 0x11, tmp_rm, rmmod); + + // below is the case we handle opcode extension for 2 byte opcode + tmp_rm = opcode2byte_extension_lookup((op2tb >> 3) & 0x7, modRM_reg); + // condition is : 2 byte opcode and extension mode + rmmod = cselect(((extended_opcode << 4) | (op2tb >> 6)) == 0x13, tmp_rm, rmmod); + rmmod = cselect(((extended_opcode << 12) | (uint32_t)((op2tb >> 6) << 8) | (uint32_t)(modRM_mod << 4) | (uint32_t)((op2tb >> 3) & 0x7)) >= 0x1333, 0, rmmod); + + // save the memory access info and size + rmmod = cselect(modRM_mod == 3, 0, rmmod); + + // Constant time ops, we always need to do them + uint8_t sib = load_mm_u8(instr_data, (uint8_t)(idx + 2)); + uint32_t mod0_rm5 = load_mm_u32(instr_data, (uint8_t)(idx + 2)); + uint32_t sib_base5_mod02 = load_mm_u32(instr_data, (uint8_t)(idx + 3)); + uint8_t sib_mod1 = load_mm_u8(instr_data, (uint8_t)(idx + 3)); + + + // remaining output + uint32_t imm4 = 0; + uint32_t sib_base = 0; + uint32_t sib_index = 0xff; + uint32_t sib_scale = 0xff; + uint32_t imm1 = 0; + + // bit ops + // when modRM_mod is 0 and modRM_rm is 5, it's displacement only + imm4 = cselect(mod_rm_mod == 0x05, mod0_rm5, imm4); + idx_tail += cselect(mod_rm_mod == 0x05, 4, 0); + + // when modRM_mod isn't 3 and modRM_rm is 4, means we have SIB byte + sib_scale = cselect((big_fg << 8 | modRM_rm) == 0x104, (sib >> 6) & (3), sib_scale); + idx_tail += cselect((big_fg << 8 | modRM_rm) == 0x104, 1, 0); + // we can apply rex directly, since no other special cases + sib_index = cselect((big_fg << 8 | modRM_rm) == 0x104, ((sib >> 3) & 7) | ((rex & 0x2) << 2), sib_index); + sib_base = cselect((big_fg << 8 | modRM_rm) == 0x104, (sib & 7) | ((rex & 0x1) << 3), sib_base); + + imm4 = cselect((big_fg << 16 | ((sib_base & 0x7) << 8) | mod_rm_mod) == 0x10504, sib_base5_mod02, imm4); + idx_tail += cselect((big_fg << 16 | ((sib_base & 0x7) << 8) | mod_rm_mod) == 0x10504, 4, 0); + imm1 = cselect((big_fg << 8 | mod_rm_mod) == 0x144, sib_mod1, imm1); + idx_tail += cselect((big_fg << 8 | mod_rm_mod) == 0x144, 1, 0); + imm4 = cselect((big_fg << 8 | mod_rm_mod) == 0x184, sib_base5_mod02, imm4); + idx_tail += cselect((big_fg << 8 | mod_rm_mod) == 0x184, 4, 0); + uint32_t small_fg = cselect(modRM_rm != 4, 1, 0); + imm1 = cselect(((big_fg << 16) | (small_fg << 8) | modRM_mod) == 0x10101, sib, imm1); + idx_tail += cselect(((big_fg << 16) | (small_fg << 8) | modRM_mod) == 0x10101, 1, 0); + imm4 = cselect(((big_fg << 16) | (small_fg << 8) | modRM_mod) == 0x10102, mod0_rm5, imm4); + idx_tail += cselect(((big_fg << 16) | (small_fg << 8) | modRM_mod) == 0x10102, 4, 0); + // apply rex rewrite if necessary + modRM_rm = (uint8_t)cselect(((big_fg << 8) | (uint32_t)modRM_rm) == 0x104, 0x4, modRM_rm | ((rex & 0x1) << 3)); + + // This will be our final output + uint64_t addr_ans = 0; + int64_t disp = (int64_t)((int8_t)imm1); + disp = cselect64s(disp == 0, (int64_t)imm4, disp); + + // handle with the base register + uint32_t base1 = cselect(mod_rm_mod == 0x04, 0xff, modRM_rm); + sib_base = cselect(((uint32_t)(modRM_mod << 4) | (sib_base & 0x7)) == 0x5, 0xff, sib_base); + uint32_t base = cselect(modRM_rm == 0x4, sib_base, base1); + base = cselect(((modRM_mod << 4) | (modRM & (7))) == 0x05, 16, base); + uint64_t base_val = register_value_select(ctx, base); + base_val = cselect64(base, 16, ctx->rip, base_val); + base_val = cselect64(base, 0xff, 0, base_val); + base_val = cselect64(addr_rewrite, 1, (uint64_t)((uint32_t)base_val), base_val); + addr_ans += base_val; + + // handle with the index register & the scale + uint32_t index = cselect((sib_index) == 0x4, 0xff, sib_index); + uint32_t scale = sib_scale; + uint64_t index_val = register_value_select(ctx, index); + index_val = cselect64(index, 16, ctx->rip, index_val); + index_val = cselect64(index, 0xff, 0, index_val); + index_val = cselect64(addr_rewrite, 1, (uint64_t)((uint32_t)index_val), index_val); + uint64_t scale_val = (uint64_t)(1 << scale); + addr_ans += index_val * scale_val; + + idx_tail += cselect(rmmod == 0, 0, 1); + // add the displacement + addr_ans = (uint64_t)((int64_t)addr_ans + disp); + //special case handler for string related instructions + addr_ans = cselect64(string_instruction, 1, register_value_select(ctx, (((uint32_t)tblp>>2) & 0xf) + 0x6 ), addr_ans); + rmmod = cselect(string_instruction, ((uint32_t)tblp & 0x3) + 1, rmmod); //either read or write, no no-access + rmmod = cselect(stack_instruction | stack_instruction_ext, 2, rmmod); + addr_ans = cselect64(stack_instruction | stack_instruction_ext, 1, register_value_select(ctx, 4), addr_ans); + rmmod = cselect(rmmod + lock_prefix > 2, 1, rmmod); + idx_tail = cselect(stack_instruction | string_instruction, 1, idx_tail); + idx_tail = cselect(stack_instruction_ext, 2, idx_tail); + // idx will always be the length of prefixes + opcodes -1 + // idx_tail starting with 1, then includes the length of the rest: modRM, sib, disp + // we won't have IMM lengths, luckily they are not affect much about address accessing + // still under testing the case: IMM may have affect on memory accessing + uint32_t inst_length = idx + idx_tail; + // write the final output to the memory + rmmod = cselect((rmmod & (inst_length + shift_amount < 17)) != 0, rmmod, rmmod); + *addr = addr_ans; + + if (g_cpu_feature_indicator & CPU_FEATURE_AVX512F) + { + asm ( // restore caller state + "vmovaps zmm0, [%0+0*64]\n\t" + "vmovaps zmm1, [%0+1*64]\n\t" + "vmovaps zmm2, [%0+2*64]\n\t" + "vmovaps zmm3, [%0+3*64]\n\t" + "vmovaps zmm4, [%0+4*64]\n\t" + "vmovaps zmm5, [%0+5*64]\n\t" + "vmovaps zmm6, [%0+6*64]\n\t" + "vmovaps zmm7, [%0+7*64]\n\t" + "vmovaps zmm8, [%0+8*64]\n\t" + "vmovaps zmm9, [%0+9*64]\n\t" + "vmovaps zmm10, [%0+10*64]\n\t" + "vmovaps zmm11, [%0+11*64]\n\t" + "vmovaps zmm12, [%0+12*64]\n\t" + "vmovaps zmm13, [%0+13*64]\n\t" + "vmovaps zmm14, [%0+14*64]\n\t" + "vmovaps zmm15, [%0+15*64]\n\t" + : : "r" (mm_regs) : "memory" + ); + } else { + //restore the status of ymm registers + asm ( // restore caller state + "vmovaps ymm0, [%0+0*32]\n\t" + "vmovaps ymm1, [%0+1*32]\n\t" + "vmovaps ymm2, [%0+2*32]\n\t" + "vmovaps ymm3, [%0+3*32]\n\t" + "vmovaps ymm4, [%0+4*32]\n\t" + "vmovaps ymm5, [%0+5*32]\n\t" + "vmovaps ymm6, [%0+6*32]\n\t" + "vmovaps ymm7, [%0+7*32]\n\t" + "vmovaps ymm8, [%0+8*32]\n\t" + "vmovaps ymm9, [%0+9*32]\n\t" + "vmovaps ymm10, [%0+10*32]\n\t" + "vmovaps ymm11, [%0+11*32]\n\t" + "vmovaps ymm12, [%0+12*32]\n\t" + "vmovaps ymm13, [%0+13*32]\n\t" + "vmovaps ymm14, [%0+14*32]\n\t" + "vmovaps ymm15, [%0+15*32]\n\t" + : : "r" (mm_regs) : "memory" + ); + } + + return (int)rmmod; +} diff --git a/sdk/trts/ctd.h b/sdk/trts/ctd.h new file mode 100644 index 000000000..140ad72db --- /dev/null +++ b/sdk/trts/ctd.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2011-2023 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/** + * File: ctd.h + * Description: + * Constant-time decoder header + */ + +#ifndef _CTD_H_ +#define _CTD_H_ +#include "se_types.h" +#include "sgx_trts_exception.h" +#include "se_cpu_feature.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief our function to disassemble one instruction from the given input + * One x64 instruction has at most 15 bytes, and our current output takes 12 bytes + * + */ +int ct_decode(sgx_cpu_context_t *ctx, uint64_t *addr); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sdk/trts/linux/trts_mitigation.S b/sdk/trts/linux/trts_mitigation.S index b419247fb..cbfa6c2f3 100644 --- a/sdk/trts/linux/trts_mitigation.S +++ b/sdk/trts/linux/trts_mitigation.S @@ -65,7 +65,7 @@ aex_notify_c3_cache: * uintptr_t ssa_aexnotify_addr, * uintptr_t stack_tickle_pages, * uintptr_t code_tickle_page, - * uintptr_t data_tickle_page, + * uintptr_t data_tickle_address, * uintptr_t c3_byte_address); * * Function: constant_time_apply_sgxstep_mitigation_and_continue_execution @@ -77,7 +77,7 @@ aex_notify_c3_cache: * - ssa_aexnotify_addr: address of the SSA[0].GPRSGX.AEXNOTIFY byte * - stack_tickle_pages: Base address of stack page(s) to tickle * - code_tickle_page: Base address of code page to tickle - * - data_tickle_page: Base address of data page to tickle + * - data_tickle_address: address of data memory to tickle * - c3_byte_address: Address of a c3 byte in code_tickle_page * There are three additional "implicit" parameters to this function: * 1. The low-order bit of `stack_tickle_pages` is 1 if a second stack diff --git a/sdk/trts/trts_veh.cpp b/sdk/trts/trts_veh.cpp index 46b4a6695..11504d49d 100644 --- a/sdk/trts/trts_veh.cpp +++ b/sdk/trts/trts_veh.cpp @@ -55,6 +55,7 @@ #include "emm_private.h" #include "sgx_mm_rt_abstraction.h" #include "sgx_trts_aex.h" +#include "ctd.h" #include "se_memcpy.h" typedef struct _handler_node_t @@ -199,18 +200,30 @@ extern "C" __attribute__((regparm(1))) void continue_execution(sgx_exception_inf extern "C" void restore_xregs(uint8_t *buf); extern "C" void constant_time_apply_sgxstep_mitigation_and_continue_execution(sgx_exception_info_t *info, - uintptr_t ssa_aexnotify_addr, uintptr_t stack_tickle_pages, uintptr_t code_tickle_page, uintptr_t data_tickle_page, uintptr_t c3_byte_address); + uintptr_t ssa_aexnotify_addr, uintptr_t stack_tickle_pages, uintptr_t code_tickle_page, uintptr_t data_tickle_address, uintptr_t c3_byte_address); +// constant time select based on given condition +static inline uint64_t cselect64(uint64_t pred, const uint64_t expected, uint64_t old_val, uint64_t new_val) +{ + __asm__("cmp %3, %1\n\t" + "cmove %2, %0" + : "+rm"(new_val) + : "rm"(pred), "rm"(old_val), "ri"(expected)); + return new_val; +} // apply the constant time mitigation handler static void apply_constant_time_sgxstep_mitigation_and_continue_execution(sgx_exception_info_t *info) { thread_data_t *thread_data = get_thread_data(); - uintptr_t code_tickle_page, c3_byte_address, stack_tickle_pages, data_tickle_page, + int ct_result; + uint64_t data_address; + uintptr_t code_tickle_page, c3_byte_address, stack_tickle_pages, data_tickle_address, stack_base_page = ((thread_data->stack_base_addr & ~0xFFF) == 0) ? (thread_data->stack_base_addr) - 0x1000 : (thread_data->stack_base_addr & ~0xFFF), stack_limit_page = thread_data->stack_limit_addr & ~0xFFF; + int data_tickle_address_is_within_enclave; // Determine which stack pages can be tickled if (((uintptr_t)info & ~0xFFF) == stack_base_page) { @@ -231,11 +244,6 @@ static void apply_constant_time_sgxstep_mitigation_and_continue_execution(sgx_ex stack_tickle_pages = (((uintptr_t)info & ~0xFFF) + 0x1000) | 1; } - // Determine which data page can be tickled. - // FIXME: This will eventually call the CTD to obtain the page. For - // now, we redundantly tickle a stack page. - data_tickle_page = stack_tickle_pages & ~1; - // Look up the code page in the c3 cache code_tickle_page = info->cpu_context.REG(ip) & ~0xFFF; c3_byte_address = code_tickle_page + *(aex_notify_c3_cache + ((code_tickle_page >> 12) & 0x07FF)); @@ -251,6 +259,29 @@ static void apply_constant_time_sgxstep_mitigation_and_continue_execution(sgx_ex } } + ct_result = ct_decode(&info->cpu_context, &data_address); + + data_tickle_address = stack_tickle_pages & ~0x1; + data_tickle_address = cselect64(ct_result, 1, data_address, data_tickle_address); + data_tickle_address = cselect64(ct_result, 2, data_address, data_tickle_address); + data_tickle_address_is_within_enclave = + sgx_is_within_enclave((void*) data_tickle_address, sizeof(uint8_t)); + + /* + * Ensure the tickle page dereferenced by the mitigation lies _inside_ the enclave. + * + * NOTE: + * - Unguarded user memory accesses can leak through MMIO stale data. + * - User memory accesses are detectable and single-steppable anyway. + * - Below non-cst time check can only ever be false when the next enclave + * instruction will dereference user memory (trivially known to attacker). + */ + data_tickle_address = data_tickle_address_is_within_enclave ? + data_tickle_address : stack_tickle_pages & ~0x1; + + code_tickle_page = cselect64(ct_result, 2, code_tickle_page | 0x1, code_tickle_page); + code_tickle_page = cselect64(data_tickle_address_is_within_enclave, 1, code_tickle_page, code_tickle_page & ~0x1); + // Pop an entropy byte from the entropy cache if (--thread_data->aex_notify_entropy_remaining < 0) { if (0 == do_rdrand(&thread_data->aex_notify_entropy_cache)) @@ -275,7 +306,7 @@ static void apply_constant_time_sgxstep_mitigation_and_continue_execution(sgx_ex constant_time_apply_sgxstep_mitigation_and_continue_execution( info, thread_data->first_ssa_gpr + offsetof(ssa_gpr_t, aex_notify), stack_tickle_pages, code_tickle_page, - data_tickle_page, c3_byte_address); + data_tickle_address, c3_byte_address); } // the 2nd phrase exception handing, which traverse registered exception handlers.