Draft: Deduplicate project_authorizations
What does this MR do and why?
todo: rewrite the sampler so that it doesn't use those costly OFFSET queries.
Deduplicates the project_authorizations
with a keyset background migration (#418205).
Ports !82460 (merged) to a batched background migration.
A unique index was dropped from the table and installations now possibly contain duplicated rows. Since the table is large, we cannot use a post-deployment migration to deduplicate it. The table was previously deduplicated with a post-deployment migration in !82460 (merged), which this MR ports to the existing batched background migrations framework.
How to set up and validate locally
Sampling the migration
bin/rails db:migrate:redo:main VERSION=20240531000004
mkdir -p tmp/migration-testing/main/background_migrations
echo "0" > tmp/migration-testing/main/background_migrations/last-batched-background-migration-id.txt
bin/rake gitlab:db:migration_testing:sample_batched_background_migrations:main
ls tmp/migration-testing/main/background_migrations/DeduplicateProjectAuthorizations
Finalizing the migration
bin/rails db:migrate:redo:main VERSION=20240104000004
bin/rake gitlab:background_migrations:keyset:finalize:main[DeduplicateProjectAuthorizations,project_authorizations,'[{\"order\":\"asc\"\\,\"column\":\"user_id\"}\\,{\"order\":\"asc\"\\,\"column\":\"project_id\"}]']
Compare with the migration's finalize_command
:
[1] pry(main)> mig = Gitlab::Database::BackgroundMigration::BatchedMigration.find_by(job_class_name: "DeduplicateProjectAuthorizations")
[2] pry(main)> mig.finalize_command # => "sudo gitlab-rake gitlab:background_migrations:keyset:finalize[...]"
Validate the keyset ranges:
SELECT
min_keyset_cursor,
max_keyset_cursor
FROM
batched_background_migration_jobs
JOIN
batched_background_migrations
ON batched_background_migration_id = batched_background_migrations.id
WHERE
batched_background_migrations.job_class_name = 'DeduplicateProjectAuthorizations'
ORDER BY
batched_background_migration_jobs.id ASC;
Database queries
#last_keyset_job
The exact order condition varies by keyset definition.
https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/25185/commands/80077
SELECT
"batched_background_migration_jobs".*
FROM
"batched_background_migration_jobs"
WHERE
"batched_background_migration_jobs"."batched_background_migration_id" = 123
AND "batched_background_migration_jobs"."max_keyset_cursor" IS NOT NULL
ORDER BY
max_keyset_cursor -> 0 ASC,
max_keyset_cursor -> 1 DESC
LIMIT 1;
Toggle setup script
-- Columns
ALTER TABLE batched_background_migration_jobs
ADD COLUMN min_keyset_cursor JSONB,
ADD COLUMN max_keyset_cursor JSONB;
ALTER TABLE batched_background_migration_jobs
ALTER COLUMN min_value DROP NOT NULL;
ALTER TABLE batched_background_migration_jobs
ALTER COLUMN max_value DROP NOT NULL;
-- Indexes
CREATE INDEX index_batched_jobs_by_max_keyset_cursor_1 ON batched_background_migration_jobs USING btree (batched_background_migration_id, ((max_keyset_cursor -> 0)), ((max_keyset_cursor -> 1)))
WHERE (max_keyset_cursor IS NOT NULL);
CREATE INDEX index_batched_jobs_by_max_keyset_cursor_2 ON batched_background_migration_jobs USING btree (batched_background_migration_id, ((max_keyset_cursor -> 0)), ((max_keyset_cursor -> 1)) DESC)
WHERE (max_keyset_cursor IS NOT NULL);
-- Write rows
DO $$
DECLARE
i int := 1;
BEGIN
WHILE i <= 10000 LOOP
INSERT INTO batched_background_migration_jobs (
id,
created_at,
updated_at,
batched_background_migration_id,
batch_size,
sub_batch_size,
status,
attempts,
metrics,
pause_ms,
min_keyset_cursor,
max_keyset_cursor
) VALUES (
nextval('batched_background_migration_jobs_id_seq'),
NOW(),
NOW(),
1000476, -- BackfillOwaspTopTenOfVulnerabilityReads
1, -- batch_size
1, -- sub_batch_size
0, -- status
0, -- attempts
'{}'::jsonb, -- metrics
100, -- pause_ms
jsonb_build_array(i, 42),
jsonb_build_array(i + 1, 42)
);
i := i + 1;
END LOOP;
END $$;
::for_keyset
SELECT
"batched_background_migrations".*
FROM
"batched_background_migrations"
WHERE
"batched_background_migrations"."keyset_order" IS NOT NULL
AND "batched_background_migrations"."job_class_name" = 'TestJob'
AND "batched_background_migrations"."table_name" = 'events'
AND (keyset_order = '[{"column":"id","order":"desc"}]')
AND "batched_background_migrations"."gitlab_schema" = 'gitlab_main';
https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/25854/commands/81523
::delete_batched_background_keyset_migration
DELETE FROM "batched_background_migrations"
WHERE "batched_background_migrations"."keyset_order" IS NOT NULL
AND "batched_background_migrations"."job_class_name" = 'TestJob'
AND "batched_background_migrations"."table_name" = 'events'
AND (keyset_order = '[{"column":"id","order":"desc"}]')
AND "batched_background_migrations"."gitlab_schema" = 'gitlab_main';
https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/25854/commands/81524
MR acceptance checklist
This checklist encourages us to confirm any changes have been analyzed to reduce risks in quality, performance, reliability, security, and maintainability.
-
I have evaluated the MR acceptance checklist for this MR.
Related to #418205