LLM4Binary committed on
Commit
69fbd59
·
verified ·
1 Parent(s): a9a6633

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +126 -49
README.md CHANGED
@@ -14,80 +14,157 @@ LLM4Decompile aims to decompile x86 assembly instructions into C. The newly rele
14
 
15
  ### 2. Evaluation Results
16
 
17
- | Model | HumanEval-Decompile | | | | | ExeBench | | | | |
18
- |:-----------------------:|:-------------------:|:------:|:------:|:------:|:------:|:--------:|:------:|:------:|:------:|:------:|
19
- | opt-level | O0 | O1 | O2 | O3 | Avg. | O0 | O1 | O2 | O3 | Avg. |
20
- | GPT4 | 0.1341 | 0.1890 | 0.1524 | 0.0854 | 0.1402 | TBD | TBD | TBD | TBD | TBD |
21
- | Deepseek-Coder-33B | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
22
- | LLM4Decompile-6.7B-UO | 0.3720 | 0.1585 | 0.2134 | 0.2134 | 0.2393 | 0.0904 | 0.0988 | 0.0988 | 0.0950 | 0.0957 |
23
- | LLM4Decompile-1.3B-V1.5 | 0.4817 | 0.2463 | 0.2329 | 0.2280 | 0.2972 | 0.2076 | 0.1774 | 0.1721 | 0.1728 | 0.1824 |
24
- | LLM4Decompile-6.7B-V1.5 | 0.6927 | 0.4280 | 0.4134 | 0.3732 | 0.4768 | 0.2453 | 0.1999 | 0.1927 | 0.1938 | 0.2079 |
 
25
 
26
  ### 3. How to Use
27
- Here is an example of how to use our model (Revised for V1.5).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  Note: **Replace** func0 with the function name you want to decompile.
29
 
30
  **Preprocessing:** Compile the C code into binary, and disassemble the binary into assembly instructions.
31
  ```python
32
- import subprocess
33
  import os
 
 
34
 
35
  OPT = ["O0", "O1", "O2", "O3"]
36
- fileName = 'samples/sample' #'path/to/file'
37
- for opt_state in OPT:
38
- output_file = fileName +'_' + opt_state
39
- input_file = fileName+'.c'
40
- compile_command = f'gcc -o {output_file}.o {input_file} -{opt_state} -lm'#compile the code with GCC on Linux
41
- subprocess.run(compile_command, shell=True, check=True)
42
- compile_command = f'objdump -d {output_file}.o > {output_file}.s'#disassemble the binary file into assembly instructions
43
- subprocess.run(compile_command, shell=True, check=True)
44
-
45
- input_asm = ''
46
- with open(output_file+'.s') as f:#asm file
47
- asm= f.read()
48
- if '<'+'func0'+'>:' not in asm: #IMPORTANT replace func0 with the function name
49
- raise ValueError("compile fails")
50
- asm = '<'+'func0'+'>:' + asm.split('<'+'func0'+'>:')[-1].split('\n\n')[0] #IMPORTANT replace func0 with the function name
51
- asm_clean = ""
52
- asm_sp = asm.split("\n")
53
- for tmp in asm_sp:
54
- if len(tmp.split("\t"))<3 and '00' in tmp:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  continue
56
- idx = min(
57
- len(tmp.split("\t")) - 1, 2
58
- )
59
- tmp_asm = "\t".join(tmp.split("\t")[idx:]) # remove the binary code
60
- tmp_asm = tmp_asm.split("#")[0].strip() # remove the comments
61
- asm_clean += tmp_asm + "\n"
62
- input_asm = asm_clean.strip()
63
- before = f"# This is the assembly code:\n"#prompt
64
- after = "\n# What is the source code?\n"#prompt
65
- input_asm_prompt = before+input_asm.strip()+after
66
- with open(fileName +'_' + opt_state +'.asm','w',encoding='utf-8') as f:
67
- f.write(input_asm_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  ```
 
69
 
70
- **Decompilation:** Use LLM4Decompile to translate the assembly instructions into C:
71
  ```python
72
  from transformers import AutoTokenizer, AutoModelForCausalLM
73
  import torch
74
 
75
- model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5' # V1.5 Model
76
  tokenizer = AutoTokenizer.from_pretrained(model_path)
77
- model = AutoModelForCausalLM.from_pretrained(model_path,torch_dtype=torch.bfloat16).cuda()
78
 
79
- with open(fileName +'_' + OPT[0] +'.asm','r') as f:#optimization level O0
80
  asm_func = f.read()
81
  inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
82
  with torch.no_grad():
83
- outputs = model.generate(**inputs, max_new_tokens=4000)
84
  c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
85
 
86
- with open(fileName +'.c','r') as f:#original file
87
  func = f.read()
88
 
89
- print(f'original function:\n{func}')# Note we only decompile one function, where the original file may contain multiple functions
90
- print(f'decompiled function:\n{c_func_decompile}')
 
91
  ```
92
 
93
  ### 4. License
 
14
 
15
  ### 2. Evaluation Results
16
 
17
+ | Metrics | Re-executability Rate | | | | | Edit Similarity | | | | |
18
+ |:-----------------------:|:---------------------:|:-------:|:-------:|:-------:|:-------:|:---------------:|:-------:|:-------:|:-------:|:-------:|
19
+ | Optimization Level | O0 | O1 | O2 | O3 | AVG | O0 | O1 | O2 | O3 | AVG |
20
+ | LLM4Decompile-End-6.7B | 0.6805 | 0.3951 | 0.3671 | 0.3720 | 0.4537 | 0.1557 | 0.1292 | 0.1293 | 0.1269 | 0.1353 |
21
+ | Ghidra | 0.3476 | 0.1646 | 0.1524 | 0.1402 | 0.2012 | 0.0699 | 0.0613 | 0.0619 | 0.0547 | 0.0620 |
22
+ | +GPT-4o | 0.4695 | 0.3415 | 0.2866 | 0.3110 | 0.3522 | 0.0660 | 0.0563 | 0.0567 | 0.0499 | 0.0572 |
23
+ | +LLM4Decompile-Ref-1.3B | 0.6890 | 0.3720 | 0.4085 | 0.3720 | 0.4604 | 0.1517 | 0.1325 | 0.1292 | 0.1267 | 0.1350 |
24
+ | +LLM4Decompile-Ref-6.7B | 0.7439 | 0.4695 | 0.4756 | 0.4207 | 0.5274 | 0.1559 | 0.1353 | 0.1342 | 0.1273 | 0.1382 |
25
+ | +LLM4Decompile-Ref-33B | 0.7073 | 0.4756 | 0.4390 | 0.4146 | 0.5091 | 0.1540 | 0.1379 | 0.1363 | 0.1307 | 0.1397 |
26
 
27
  ### 3. How to Use
28
+ Here is an example of how to use our model (V2 only; for earlier models, please see the corresponding model page on Hugging Face).
29
+
30
+ 1. Install Ghidra
31
+ Download [Ghidra](https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_11.0.3_build/ghidra_11.0.3_PUBLIC_20240410.zip) to the current folder. You can also check the [releases page](https://github.com/NationalSecurityAgency/ghidra/releases) for other versions. Unzip the package to the current folder.
32
+ In bash, you can use the following:
33
+ ```bash
34
+ cd LLM4Decompile/ghidra
35
+ wget https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_11.0.3_build/ghidra_11.0.3_PUBLIC_20240410.zip
36
+ unzip ghidra_11.0.3_PUBLIC_20240410.zip
37
+ ```
38
+ 2. Install Java-SDK-17
39
+ Ghidra 11 depends on Java-SDK-17 (JDK 17). A simple way to install the SDK on Ubuntu:
40
+ ```bash
41
+ apt-get update
42
+ apt-get upgrade
43
+ apt install openjdk-17-jdk openjdk-17-jre
44
+ ```
45
+ Please check the [Ghidra installation guide](https://htmlpreview.github.io/?https://github.com/NationalSecurityAgency/ghidra/blob/Ghidra_11.1.1_build/GhidraDocs/InstallationGuide.html) for other platforms.
46
+
47
+ 3. Use Ghidra Headless to decompile the binary (demo.py)
48
+
49
  Note: **Replace** func0 with the function name you want to decompile.
50
 
51
  **Preprocessing:** Compile the C code into a binary, and decompile the binary into pseudo-code with Ghidra.
52
  ```python
 
53
  import os
54
+ import subprocess
55
+ from tqdm import tqdm,trange
56
 
57
  OPT = ["O0", "O1", "O2", "O3"]
58
+ timeout_duration = 10
59
+
60
+ ghidra_path = "./ghidra_11.0.3_PUBLIC/support/analyzeHeadless"#path to the headless analyzer, change the path accordingly
61
+ postscript = "./decompile.py"#path to the decompiler helper function, change the path accordingly
62
+ project_path = "."#path to temp folder for analysis, change the path accordingly
63
+ project_name = "tmp_ghidra_proj"
64
+ func_path = "../samples/sample.c"#path to c code for compiling and decompiling, change the path accordingly
65
+ fileName = "sample"
66
+
67
+ with tempfile.TemporaryDirectory() as temp_dir:
68
+ pid = os.getpid()
69
+ asm_all = {}
70
+ for opt in [OPT[0]]:
71
+ executable_path = os.path.join(temp_dir, f"{pid}_{opt}.o")
72
+ cmd = f'gcc -{opt} -o {executable_path} {func_path} -lm'
73
+ subprocess.run(
74
+ cmd.split(' '),
75
+ check=True,
76
+ stdout=subprocess.DEVNULL, # Suppress stdout
77
+ stderr=subprocess.DEVNULL, # Suppress stderr
78
+ timeout=timeout_duration,
79
+ )
80
+
81
+ output_path = os.path.join(temp_dir, f"{pid}_{opt}.c")
82
+ command = [
83
+ ghidra_path,
84
+ temp_dir,
85
+ project_name,
86
+ "-import", executable_path,
87
+ "-postScript", postscript, output_path,
88
+ "-deleteProject", # WARNING: This will delete the project after analysis
89
+ ]
90
+ result = subprocess.run(command, text=True, capture_output=True, check=True)
91
+ with open(output_path,'r') as f:
92
+ c_decompile = f.read()
93
+ c_func = []
94
+ flag = 0
95
+ for line in c_decompile.split('\n'):
96
+ if "Function: func0" in line:#**Replace** func0 with the function name you want to decompile.
97
+ flag = 1
98
+ c_func.append(line)
99
  continue
100
+ if flag:
101
+ if '// Function:' in line:
102
+ if len(c_func) > 1:
103
+ break
104
+ c_func.append(line)
105
+ if flag == 0:
106
+ raise ValueError('bad case no function found')
107
+ for idx_tmp in range(1,len(c_func)):##########remove the comments
108
+ if 'func0' in c_func[idx_tmp]:
109
+ break
110
+ c_func = c_func[idx_tmp:]
111
+ input_asm = '\n'.join(c_func).strip()
112
+
113
+ before = f"# This is the assembly code:\n"#prompt
114
+ after = "\n# What is the source code?\n"#prompt
115
+ input_asm_prompt = before+input_asm.strip()+after
116
+ with open(fileName +'_' + opt +'.pseudo','w',encoding='utf-8') as f:
117
+ f.write(input_asm_prompt)
118
+ ```
119
+
120
+ Ghidra pseudo-code may look like this:
121
+ ```c
122
+ undefined4 func0(float param_1,long param_2,int param_3)
123
+ {
124
+ int local_28;
125
+ int local_24;
126
+
127
+ local_24 = 0;
128
+ do {
129
+ local_28 = local_24;
130
+ if (param_3 <= local_24) {
131
+ return 0;
132
+ }
133
+ while (local_28 = local_28 + 1, local_28 < param_3) {
134
+ if ((double)((ulong)(double)(*(float *)(param_2 + (long)local_24 * 4) -
135
+ *(float *)(param_2 + (long)local_28 * 4)) &
136
+ SUB168(_DAT_00402010,0)) < (double)param_1) {
137
+ return 1;
138
+ }
139
+ }
140
+ local_24 = local_24 + 1;
141
+ } while( true );
142
+ }
143
  ```
144
+ 4. Refine pseudo-code using LLM4Decompile (demo.py)
145
 
146
+ **Decompilation:** Use LLM4Decompile-Ref to refine the Ghidra pseudo-code into C:
147
  ```python
148
  from transformers import AutoTokenizer, AutoModelForCausalLM
149
  import torch
150
 
151
+ model_path = 'LLM4Binary/llm4decompile-6.7b-v2' # V2 Model
152
  tokenizer = AutoTokenizer.from_pretrained(model_path)
153
+ model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
154
 
155
+ with open(fileName +'_' + OPT[0] +'.pseudo','r') as f:#optimization level O0
156
  asm_func = f.read()
157
  inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
158
  with torch.no_grad():
159
+ outputs = model.generate(**inputs, max_new_tokens=2048)### the max length is 4096 tokens, so max_new_tokens should stay below that limit
160
  c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
161
 
162
+ with open(fileName +'_' + OPT[0] +'.pseudo','r') as f:#Ghidra pseudo-code file (the prompt given to the model)
163
  func = f.read()
164
 
165
+ print(f'pseudo function:\n{func}')# Note we only decompile one function, where the original file may contain multiple functions
166
+ print(f'refined function:\n{c_func_decompile}')
167
+
168
  ```
169
 
170
  ### 4. License