|
27 | 27 | metadata_blob BLOB NOT NULL, |
28 | 28 | UNIQUE(binary_version) |
29 | 29 | ); |
| 30 | +
|
| 31 | +CREATE TABLE IF NOT EXISTS diff_results ( |
| 32 | + id INTEGER PRIMARY KEY AUTOINCREMENT, |
| 33 | + function_name TEXT NOT NULL, |
| 34 | + old_pseudocode BLOB NOT NULL, |
| 35 | + new_pseudocode BLOB NOT NULL, |
| 36 | + old_address INTEGER, |
| 37 | + new_address INTEGER, |
| 38 | + old_blocks INTEGER, |
| 39 | + new_blocks INTEGER, |
| 40 | + old_signature TEXT, |
| 41 | + new_signature TEXT, |
| 42 | + ratio REAL, |
| 43 | + smart_ratio REAL, |
| 44 | + modification_level TEXT, |
| 45 | + UNIQUE(function_name) |
| 46 | +); |
30 | 47 | """ |
31 | 48 |
|
32 | 49 | def compress_pseudo(pseudo_lines: list[str]) -> bytes: |
@@ -55,6 +72,17 @@ def init_db(db_path: str): |
55 | 72 | conn.execute(stmt) |
56 | 73 | except Exception as e: |
57 | 74 | log.warning(f"Migration step failed: {stmt}: {e}") |
| 75 | + # Ensure diff_results table exists (older DBs won't have it) |
| 76 | + conn.execute("CREATE TABLE IF NOT EXISTS diff_results (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n function_name TEXT NOT NULL,\n old_pseudocode BLOB NOT NULL,\n new_pseudocode BLOB NOT NULL,\n old_address INTEGER,\n new_address INTEGER,\n old_blocks INTEGER,\n new_blocks INTEGER,\n old_signature TEXT,\n new_signature TEXT,\n ratio REAL,\n smart_ratio REAL,\n modification_level TEXT,\n UNIQUE(function_name)\n)") |
| 77 | + # Add modification_level column if it doesn't exist (migration) |
| 78 | + try: |
| 79 | + # Check if modification_level column exists |
| 80 | + cols = {r[1] for r in conn.execute("PRAGMA table_info(diff_results)").fetchall()} |
| 81 | + if "modification_level" not in cols: |
| 82 | + conn.execute("ALTER TABLE diff_results ADD COLUMN modification_level TEXT") |
| 83 | + log.info("Added modification_level column to diff_results table") |
| 84 | + except Exception as e: |
| 85 | + log.warning(f"Could not add modification_level column: {e}") |
58 | 86 | except Exception as e: |
59 | 87 | log.warning(f"Could not run PRAGMA table_info migration checks: {e}") |
60 | 88 | conn.commit() |
@@ -104,3 +132,141 @@ def upsert_binary_metadata(conn, version: str, address_min: int, address_max: in |
104 | 132 | (version, address_min, address_max, function_count, metadata_blob), |
105 | 133 | ) |
106 | 134 | conn.commit() |
| 135 | + |
| 136 | + |
| 137 | +def _safe_ratio(a: str | None, b: str | None) -> float: |
| 138 | + try: |
| 139 | + import difflib |
| 140 | + if not a or not b: |
| 141 | + return 0.0 |
| 142 | + if a == b: |
| 143 | + return 1.0 |
| 144 | + return difflib.SequenceMatcher(None, a, b).ratio() |
| 145 | + except Exception: |
| 146 | + return 0.0 |
| 147 | + |
| 148 | + |
def _compute_smart_ratio(text_old: str | None, text_new: str | None, blocks_old: int | None, blocks_new: int | None) -> float:
    """Return a change score combining text dissimilarity and block-count delta.

    Despite the name, the result is a *change* score (higher means more
    change), not a similarity ratio, and it is not capped at ``1.0``:

    * If either block count is ``None`` or ``0``, the score is the plain text
      dissimilarity ``1.0 - _safe_ratio(text_old, text_new)``.
    * If the block counts are equal, the text dissimilarity is scaled by 0.05.
    * Otherwise the score is ``abs(blocks_old - blocks_new) / 50.0`` plus the
      text dissimilarity scaled by 0.2.

    Any exception is swallowed and reported as ``0.0``.
    """
    try:
        text_change = 1.0 - _safe_ratio(text_old, text_new)

        block_counts_unusable = (
            blocks_old is None
            or blocks_new is None
            or blocks_old == 0
            or blocks_new == 0
        )
        if block_counts_unusable:
            return text_change

        block_delta = abs(blocks_old - blocks_new)
        if block_delta == 0:
            # Same block count: text differences alone weigh very little.
            return text_change * 0.05

        # Block delta dominates; text dissimilarity is a secondary term.
        return block_delta / 50.0 + text_change * 0.2
    except Exception:
        return 0.0
| 170 | + |
| 171 | + |
| 172 | +def _determine_modification_level(score: float) -> str: |
| 173 | + """Categorize the modification level based on score""" |
| 174 | + if score == 0.0: |
| 175 | + return "unchanged" |
| 176 | + elif score < 0.1: |
| 177 | + return "minor" |
| 178 | + elif score < 0.3: |
| 179 | + return "moderate" |
| 180 | + elif score < 0.6: |
| 181 | + return "significant" |
| 182 | + else: |
| 183 | + return "major" |
| 184 | + |
| 185 | + |
def compute_and_store_diffs(conn: sqlite3.Connection):
    """
    Populate diff_results with functions that exist in both versions and differ.

    For every function name that has both an 'old' and a 'new' row in the
    ``functions`` table, the pseudocode of both rows is decompressed and
    compared. Pairs whose text differs are written to ``diff_results`` together
    with their similarity ratio, smart change score and modification level.
    Rows in ``functions`` are then deleted for every name whose diff was
    actually stored; unmatched and unchanged entries stay in ``functions``.

    A name is pruned from ``functions`` only when its ``diff_results`` row was
    really inserted. ``INSERT OR IGNORE`` silently skips names that already
    have a row (``UNIQUE(function_name)``); deleting the source rows in that
    case would discard data that was never stored.

    Args:
        conn: Open SQLite connection holding the ``functions`` and
            ``diff_results`` tables.
    """
    # Find candidate names that have both versions.
    cursor = conn.execute(
        """
        SELECT f_old.function_name
        FROM functions AS f_old
        INNER JOIN functions AS f_new
            ON f_new.function_name = f_old.function_name
           AND f_new.binary_version = 'new'
        WHERE f_old.binary_version = 'old'
        """
    )
    names = [r[0] for r in cursor.fetchall()]
    print("\n[+] Preparing diff computation …")
    print(f"[+] Total matched functions: {len(names)}")
    if not names:
        print("[+] No matched functions found. Skipping diff computation.")
        return

    row_query = (
        "SELECT pseudocode, address, blocks, signature FROM functions "
        "WHERE function_name = ? AND binary_version = ?"
    )
    insert_query = """
        INSERT OR IGNORE INTO diff_results (
            function_name,
            old_pseudocode, new_pseudocode,
            old_address, new_address,
            old_blocks, new_blocks,
            old_signature, new_signature,
            ratio, smart_ratio, modification_level
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

    inserted_names: list[str] = []
    for name in names:
        old_row = conn.execute(row_query, (name, "old")).fetchone()
        new_row = conn.execute(row_query, (name, "new")).fetchone()
        if not old_row or not new_row:
            continue

        try:
            text_old = decompress_pseudo(old_row[0]) if old_row[0] is not None else None
            text_new = decompress_pseudo(new_row[0]) if new_row[0] is not None else None
        except Exception as e:
            # Corrupt entries are skipped, but no longer silently.
            log.warning(f"Skipping {name}: could not decompress pseudocode: {e}")
            continue

        # Only store when both sides have content and the contents differ.
        if not text_old or not text_new or text_old == text_new:
            continue

        ratio = _safe_ratio(text_old, text_new)
        smart = _compute_smart_ratio(text_old, text_new, old_row[2], new_row[2])
        # The level is derived from plain text dissimilarity, not the smart score.
        level = _determine_modification_level(1.0 - ratio)

        try:
            insert_cursor = conn.execute(
                insert_query,
                (
                    name,
                    old_row[0], new_row[0],
                    old_row[1], new_row[1],
                    old_row[2], new_row[2],
                    old_row[3], new_row[3],
                    ratio, smart, level,
                ),
            )
        except Exception as e:
            log.warning(f"Failed inserting diff_results for {name}: {e}")
            continue

        # rowcount is 0 when OR IGNORE skipped the row because the name is
        # already present; such names must not be pruned from functions.
        if insert_cursor.rowcount > 0:
            inserted_names.append(name)
        else:
            log.debug(f"diff_results already has an entry for {name}; keeping functions rows")
    conn.commit()

    print(f"[+] Diff computation completed, found {len(inserted_names)} functions as changed")

    if not inserted_names:
        return

    # Delete stored pairs from functions (leave unmatched and unchanged rows).
    try:
        conn.executemany(
            "DELETE FROM functions WHERE function_name = ?",
            [(n,) for n in inserted_names],
        )
        conn.commit()
    except Exception as e:
        log.warning(f"Failed to prune matched rows from functions: {e}")
        # Drop any partially applied deletes so a later commit cannot persist them.
        conn.rollback()
0 commit comments