From 2d362ef96662296fe97163898889f10b9e43f0f9 Mon Sep 17 00:00:00 2001 From: Romain Thouvenin Date: Fri, 4 May 2018 17:50:21 +0200 Subject: [PATCH 1/2] Allow self references but exclude them from sampling --- lib/database_sampler.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/database_sampler.rb b/lib/database_sampler.rb index 8d5aff4..8085ddf 100644 --- a/lib/database_sampler.rb +++ b/lib/database_sampler.rb @@ -121,8 +121,7 @@ def get_foreign_keys(source=true) ON (t.table_name = c.table_name) WHERE t.table_schema = 'public' AND t.table_name NOT ilike 'list_members_part_list_ids_%' - AND (c.parent_table IS NULL OR t.table_name != c.parent_table) - ORDER BY t.table_name} # To avoid loops we remove self-references for now + ORDER BY t.table_name} if source @foreign_keys_source ||= (@source_conn.exec(sql).to_a + @manual_links).reject{ |fk| @exclude_tables.include?(fk['table_name']) || @exclude_tables.include?(fk['parent_table'])} @@ -135,7 +134,7 @@ def get_foreign_keys(source=true) def get_children(table_name) foreign_keys = get_foreign_keys(!@use_fks_from_target) - foreign_keys.select{ |r| r['parent_table'] == table_name }.map{ |r| r['table_name'] } + foreign_keys.select{ |r| r['parent_table'] == table_name and r['parent_table'] != r['table_name'] }.map{ |r| r['table_name'] } end @@ -146,7 +145,7 @@ def get_network network = {} foreign_keys.each do |row| data = network[row['table_name']] || {parents_count: 0, parents_remaining: 0, parents: {}, children: []} - if row['parent_table'] + if row['parent_table'] and row['parent_table'] != row['table_name'] data[:parents_count] += 1 data[:parents_remaining] += 1 data[:parents][row['parent_table']] = { source_column: row['column'], target_column: row['parent_column'] } @@ -270,6 +269,11 @@ def get_max_id_for_table(table) end def make_sample_tables + # Reject self_relations from samples + foreign_keys = get_foreign_keys(!@use_fks_from_target) + self_relations = foreign_keys.select { 
|r| r['table_name'] == r['parent_table'] }.map { |r| r['table_name'] } + @samples.reject! { |s| self_relations.include? s['table'] } + @samples.each do |sample| # Sampling a large table with ORDER BY RANDOM() is slow. So we use a faster method: generating a set of numbers in the range 1..max(id) for the table, and selecting those. # To calculate the range and number of IDs we need, we need to know two things: the max_id and what proportion of the IDs between 1 and max(id) actually exist (the 'density') From b1c27f15c6626cfbb8e14008c9a80c8dac5ad348 Mon Sep 17 00:00:00 2001 From: Romain Thouvenin Date: Fri, 4 May 2018 18:16:22 +0200 Subject: [PATCH 2/2] Update documentation for self-references --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a31cb10..748368f 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ You may have a schema in the target database which is stricter than the source d - Only supports foreign keys that reference the `id` field of the parent table. - If the schemas of the tables differ at all you'll get errors - you can use the post_copy_sql parameter to add SQL that fixes this, but it's still a manual process. -- Self relations aren't properly supported - so you need to make sure there aren't any self-relations using conditions +- Self relations aren't properly supported; they are excluded from sampling. If you don't want such tables to be copied without sampling, add them to the exclude list. ## Recommendations