[Qa-jenkins-scm] [jenkins.debian.net] 05/06: temporary: script to migrate from sqlite to postgres db

Holger Levsen holger at layer-acht.org
Mon Dec 19 11:23:36 UTC 2016


This is an automated email from the git hooks/post-receive script.

holger pushed a commit to branch master
in repository jenkins.debian.net.

commit fc17c87a536bb52eaa3bf49dbcaf6824fce7b0f4
Author: Valerie R Young <spectranaut at riseup.net>
Date:   Tue Aug 23 20:32:48 2016 -0400

    temporary: script to migrate from sqlite to postgres db
    
    Signed-off-by: Holger Levsen <holger at layer-acht.org>
---
 bin/reproducible_migrate_db.py | 154 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)

diff --git a/bin/reproducible_migrate_db.py b/bin/reproducible_migrate_db.py
new file mode 100755
index 0000000..575b04b
--- /dev/null
+++ b/bin/reproducible_migrate_db.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2016 Valerie Young <spectranaut at riseup.net>
+# Licensed under GPL-2
+#
+# Depends: sqlalchemy, reproducible_common
+#
+# Purpose: one-shot migration of the reproducible-builds data from the
+# legacy SQLite file into the PostgreSQL database configured in
+# reproducible_common.py, creating the postgres role and database first
+# when they do not exist yet. Rows with dangling foreign keys into the
+# 'sources' table are skipped and dumped to per-table CSV files.
+
+from sqlalchemy import *
+from sqlalchemy.schema import *
+import sqlalchemy.util
+from pprint import pprint
+import subprocess
+import sys
+import os
+import csv
+
+# We skip the database connection because the database
+# may not exist yet, but we would like to use the constants
+# available in reproducible_common.py
+sys.argv.append('--skip-database-connection')
+from reproducible_common import *
+
+# Database defined in reproducible_common.py
+DEST_DB_NAME = PGDATABASE
+DEST_DB_USER = os.environ['USER']
+
+# Old SQLite database
+SOURCE_DB_NAME = '/var/lib/jenkins/reproducible.db'
+
+try:
+    # Check if the postgres role exists; if not, create it.
+    log.info("Checking if postgres role exists...")
+    query = "SELECT 1 FROM pg_roles WHERE rolname='%s';" % DEST_DB_USER
+    command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
+    result = subprocess.check_output(command).decode("utf-8").strip()
+
+    if result == '1':
+        log.info("user exists.")
+    else:
+        log.info("Postgres role %s does not exist. Creating role."
+                 % DEST_DB_USER)
+        # NOTE(review): check_call is used unqualified here — presumably
+        # re-exported by reproducible_common's star import; otherwise this
+        # should be subprocess.check_call. Verify before running.
+        check_call(['sudo', '-u', 'postgres', 'createuser', '-w', DEST_DB_USER])
+
+    # Check if the database exists; if not, create it.
+    log.info("Checking if postgres database exists...")
+    query = "SELECT 1 FROM pg_database WHERE datname='%s'" % DEST_DB_NAME
+    command = ['sudo', '-u', 'postgres', 'psql', '-tAc', query]
+    result = subprocess.check_output(command).decode("utf-8").strip()
+
+    if result == '1':
+        log.info("database exists.")
+    else:
+        log.info("Postgres database %s does not exist. Creating database."
+                 % DEST_DB_NAME)
+        # NOTE(review): same unqualified check_call as above — verify.
+        check_call(['sudo', '-u', 'postgres', 'createdb', '-O', DEST_DB_USER,
+                    '-w', DEST_DB_NAME])
+
+except FileNotFoundError:
+    # Raised when the 'sudo'/'psql' binary cannot be found at all.
+    print_critical_message("Postgres is not installed. Install postgres before continuing.")
+    sys.exit(1)
+
+# Run reproducible_db_maintenance. This will create the appropriate schema.
+db_maintenance = os.path.join(BIN_PATH, "reproducible_db_maintenance.py")
+check_call([db_maintenance])
+
+# Connect to both databases
+dest_engine = create_engine("postgresql:///%s" % DEST_DB_NAME)
+dest_conn = dest_engine.connect()
+source_engine = create_engine("sqlite:///%s" % SOURCE_DB_NAME)
+source_conn = source_engine.connect()
+
+# Load all table definitions for both databases. They should be identical
+# (both up to date according to reproducible_db_maintenance.py)
+source_metadata = MetaData(source_engine)
+source_metadata.reflect()
+dest_metadata = MetaData(dest_engine)
+dest_metadata.reflect()
+
+# The order in which we will copy the table
+# (referenced tables such as 'sources' come before the tables — 'notes',
+# 'results' — that point at them)
+all_tables = ['sources', 'issues', 'notes', 'removed_packages', 'results',
+              'stats_bugs', 'stats_build', 'stats_builds_age', 'stats_notes',
+              'stats_issues', 'stats_meta_pkg_state', 'stats_builds_per_day',
+              'stats_pkg_state']
+
+# Get all table definitions in source and destination. If the table doesn't
+# exist in one of these two places, an error will occur.
+dest_tables = {t: Table(t, dest_metadata, autoload=True) for t in all_tables}
+source_tables = {t: Table(t, source_metadata, autoload=True) for t in all_tables}
+
+for table in all_tables:
+    log.info("Copying table: "  + table)
+    dest_table = dest_tables[table]
+    dest_columns = dest_table.columns.keys()
+    source_table = source_tables[table]
+    source_columns = source_table.columns.keys()
+
+    # 'notes' and 'results' carry a package_id foreign key into 'sources';
+    # the old SQLite db may contain rows whose package_id has no match.
+    if table in ['notes', 'results']:
+        # NOTE(review): 'sources' is reflected again here although it is
+        # already available as source_tables['sources']; harmless but
+        # redundant.
+        sources = Table('sources', source_metadata, autoload=True)
+        # only select rows with correct foreign references to the SOURCES table
+        query = sql.select(source_table.c).select_from(
+            source_table.join(sources))
+
+        # save rows with incorrect foreign references to the SOURCES table
+        ignored_query = select(source_table.c).select_from(source_table).where(
+            source_table.c.package_id.notin_(select([sources.c.id])))
+        ignored_results = source_conn.execute(ignored_query).fetchall()
+        if len(ignored_results):
+            # Dump the skipped rows so they can be inspected after migration.
+            log.info('Ignoring bad foreign keys in %s. Dumping rows to '
+                     'ignored_rows_%s.csv' % (table, table))
+            with open('ignored_rows_' + table + '.csv', 'w') as f:
+                writer = csv.DictWriter(f, fieldnames=ignored_results[0].keys())
+                writer.writeheader()
+                for row in ignored_results:
+                    writer.writerow(dict(row))
+    else:
+        # No foreign-key filtering needed: copy the table wholesale.
+        query = "select * from %s" % table
+
+    # Perform each table copy in a single transaction
+    transaction = dest_conn.begin()
+    try:
+        for record in source_conn.execute(query):
+            data = {}
+            for c in source_columns:
+                # Postgres column names are lower-case; SQLite reflection may
+                # report mixed case.
+                col = c.lower()
+                value = getattr(record, c)
+                if str(dest_table.c[col].type) == 'INTEGER' and value == "":
+                    # there exist empty string values in the sqlite database
+                    # for integers. This will result in an error if we try to
+                    # write these values to the postgres db.
+                    # The column is simply omitted from 'data', so the insert
+                    # falls back to the destination column's default/NULL.
+                    log.info("column %s has empty string/null value" % col)
+                else:
+                    data[col] = value
+
+            try:
+                dest_conn.execute(dest_table.insert(), [data])
+            # NOTE(review): bare except — intentional here so the offending
+            # row is logged before the exception propagates and aborts the
+            # transaction below.
+            except:
+                log.critical("Could not insert: %s" % data)
+                raise
+
+        # Commit the whole table at once
+        transaction.commit()
+    except:
+        # Any failure rolls back the whole table's copy, then re-raises so
+        # the migration stops rather than continuing with a partial copy.
+        transaction.rollback()
+        log.error("Transaction rolled back")
+        raise
+
+# For the autoincrementing columns to work correctly, we must set the next
+# value of the sequence to be the next highest available ID.
+# NOTE(review): setval(seq, max(id)+1) makes the next nextval() return
+# max(id)+2, skipping one id; and if the table is empty, max(id)+1 is NULL
+# and setval will error. Consider setval(seq, max(id)) / COALESCE — verify.
+table_sequences = {t : t + "_id_seq" for t in ['sources', 'results']}
+
+for table in table_sequences:
+    query = "select setval('%s', (select max(id)+1 from %s))"
+    dest_conn.execute(query % (table_sequences[table], table))

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/qa/jenkins.debian.net.git



More information about the Qa-jenkins-scm mailing list