diff --git a/load_into_pg.py b/load_into_pg.py index 33be75c..66b651d 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -30,7 +30,7 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson): """Use the cursor to mogrify a tuple of data. The passed data in `attribs` is augmented with default data (NULLs) and the order of data in the tuple is the same as in the list of `keys`. The - `cursor` is used toe mogrify the data and the `templ` is the template used + `cursor` is used to mogrify the data and the `templ` is the template used for the mogrification. """ defs = _makeDefValues(keys) @@ -45,8 +45,114 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson): values_to_insert = cursor.mogrify(templ, defs) return cursor.mogrify(templ, defs) -def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): +def _getTableKeys(table): + """Return an array of the keys for a given table""" + keys = None + if table == 'Users': + keys = [ + 'Id' + , 'Reputation' + , 'CreationDate' + , 'DisplayName' + , 'LastAccessDate' + , 'WebsiteUrl' + , 'Location' + , 'AboutMe' + , 'Views' + , 'UpVotes' + , 'DownVotes' + , 'ProfileImageUrl' + , 'Age' + , 'AccountId' + ] + elif table == 'Badges': + keys = [ + 'Id' + , 'UserId' + , 'Name' + , 'Date' + ] + elif table == 'PostLinks': + keys = [ + 'Id' + , 'CreationDate' + , 'PostId' + , 'RelatedPostId' + , 'LinkTypeId' + ] + elif table == 'Comments': + keys = [ + 'Id' + , 'PostId' + , 'Score' + , 'Text' + , 'CreationDate' + , 'UserId' + ] + elif table == 'Votes': + keys = [ + 'Id' + , 'PostId' + , 'VoteTypeId' + , 'UserId' + , 'CreationDate' + , 'BountyAmount' + ] + elif table == 'Posts': + keys = [ + 'Id' + , 'PostTypeId' + , 'AcceptedAnswerId' + , 'ParentId' + , 'CreationDate' + , 'Score' + , 'ViewCount' + , 'Body' + , 'OwnerUserId' + , 'LastEditorUserId' + , 'LastEditorDisplayName' + , 'LastEditDate' + , 'LastActivityDate' + , 'Title' + , 'Tags' + , 'AnswerCount' + , 'CommentCount' + , 
'FavoriteCount' + , 'ClosedDate' + , 'CommunityOwnedDate' + ] + elif table == 'Tags': + keys = [ + 'Id' + , 'TagName' + , 'Count' + , 'ExcerptPostId' + , 'WikiPostId' + ] + elif table == 'PostHistory': + keys = [ + 'Id', + 'PostHistoryTypeId', + 'PostId', + 'RevisionGUID', + 'CreationDate', + 'UserId', + 'Text' + ] + elif table == 'Comments': + keys = [ + 'Id', + 'PostId', + 'Score', + 'Text', + 'CreationDate', + 'UserId', + ] + return keys + +def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): """Handle the table including the post/pre processing.""" + keys = _getTableKeys(table) dbFile = mbDbFile if mbDbFile is not None else table + '.xml' tmpl = _createMogrificationTemplate(table, keys, insertJson) start_time = time.time() @@ -54,8 +160,9 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse try: pre = open('./sql/' + table + '_pre.sql').read() post = open('./sql/' + table + '_post.sql').read() + fk = open('./sql/' + table + '_fk.sql').read() except IOError as e: - six.print_("Could not load pre/post sql. Are you running from the correct path?", file=sys.stderr) + six.print_("Could not load pre/post/fk sql. 
Are you running from the correct path?", file=sys.stderr) sys.exit(-1) dbConnectionParam = "dbname={}".format(dbname) @@ -74,6 +181,7 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse if mbPassword is not None: dbConnectionParam += ' password={}'.format(mbPassword) + try: with pg.connect(dbConnectionParam) as conn: with conn.cursor() as cur: @@ -95,13 +203,12 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse for row_attribs in rows ] ) - if len(valuesStr) > 0: cmd = 'INSERT INTO ' + table + \ ' VALUES\n' + valuesStr + ';' cur.execute(cmd) conn.commit() - six.print_('Table processing took {:.1f} seconds'.format(time.time() - start_time)) + six.print_('Table {0} processing took {1:.1f} seconds'.format(table, time.time() - start_time)) # Post-processing (creation of indexes) start_time = time.time() @@ -110,6 +217,14 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse cur.execute(post) conn.commit() six.print_('Post processing took {} seconds'.format(time.time() - start_time)) + if createFk: + # fk-processing (creation of foreign keys) + start_time = time.time() + six.print_('fk processing ...') + if post != '': + cur.execute(fk) + conn.commit() + six.print_('fk processing took {} seconds'.format(time.time() - start_time)) except IOError as e: six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr) @@ -122,8 +237,6 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse six.print_("Warning from the database.", file=sys.stderr) six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr) - - ############################################################# parser = argparse.ArgumentParser() @@ -173,116 +286,16 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse , action = 'store_true' , default = False ) -args = parser.parse_args() -table = args.table -keys = None - -if table == 'Users': - keys = 
[ - 'Id' - , 'Reputation' - , 'CreationDate' - , 'DisplayName' - , 'LastAccessDate' - , 'WebsiteUrl' - , 'Location' - , 'AboutMe' - , 'Views' - , 'UpVotes' - , 'DownVotes' - , 'ProfileImageUrl' - , 'Age' - , 'AccountId' - ] -elif table == 'Badges': - keys = [ - 'Id' - , 'UserId' - , 'Name' - , 'Date' - ] -elif table == 'PostLinks': - keys = [ - 'Id' - , 'CreationDate' - , 'PostId' - , 'RelatedPostId' - , 'LinkTypeId' - ] -elif table == 'Comments': - keys = [ - 'Id' - , 'PostId' - , 'Score' - , 'Text' - , 'CreationDate' - , 'UserId' - ] -elif table == 'Votes': - keys = [ - 'Id' - , 'PostId' - , 'VoteTypeId' - , 'UserId' - , 'CreationDate' - , 'BountyAmount' - ] -elif table == 'Posts': - keys = [ - 'Id' - , 'PostTypeId' - , 'AcceptedAnswerId' - , 'ParentId' - , 'CreationDate' - , 'Score' - , 'ViewCount' - , 'Body' - , 'OwnerUserId' - , 'LastEditorUserId' - , 'LastEditorDisplayName' - , 'LastEditDate' - , 'LastActivityDate' - , 'Title' - , 'Tags' - , 'AnswerCount' - , 'CommentCount' - , 'FavoriteCount' - , 'ClosedDate' - , 'CommunityOwnedDate' - ] +parser.add_argument( '--foreign-keys' + , help = 'Create foreign keys.' + , action = 'store_true' + , default = False + ) - # If the user has not explicitly asked for loading the body, we replace it with NULL - if not args.with_post_body: - specialRules[('Posts', 'Body')] = 'NULL' +args = parser.parse_args() -elif table == 'Tags': - keys = [ - 'Id' - , 'TagName' - , 'Count' - , 'ExcerptPostId' - , 'WikiPostId' - ] -elif table == 'PostHistory': - keys = [ - 'Id', - 'PostHistoryTypeId', - 'PostId', - 'RevisionGUID', - 'CreationDate', - 'UserId', - 'Text' - ] -elif table == 'Comments': - keys = [ - 'Id', - 'PostId', - 'Score', - 'Text', - 'CreationDate', - 'UserId', - ] +table = args.table try: # Python 2/3 compatibility @@ -290,10 +303,14 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse except NameError: pass -choice = input('This will drop the {} table. 
Are you sure [y/n]?'.format(table)) +if table == 'Posts': + # If the user has not explicitly asked for loading the body, we replace it with NULL + if not args.with_post_body: + specialRules[('Posts', 'Body')] = 'NULL' + +choice = input('This will drop the {} table. Are you sure [y/n]? '.format(table)) if len(choice) > 0 and choice[0].lower() == 'y': - handleTable(table, keys, args.insert_json, args.dbname, args.file, args.host, args.port, args.username, args.password) + handleTable(table, args.insert_json, args.foreign_keys, args.dbname, args.file, args.host, args.port, args.username, args.password) else: six.print_("Cancelled.") - diff --git a/sql/Badges_fk.sql b/sql/Badges_fk.sql new file mode 100644 index 0000000..b5a4e3f --- /dev/null +++ b/sql/Badges_fk.sql @@ -0,0 +1 @@ +ALTER TABLE badges ADD CONSTRAINT fk_badges_userid FOREIGN KEY (userid) REFERENCES users (id); diff --git a/sql/Comments_fk.sql b/sql/Comments_fk.sql new file mode 100644 index 0000000..aea00c9 --- /dev/null +++ b/sql/Comments_fk.sql @@ -0,0 +1,2 @@ +ALTER TABLE Comments ADD CONSTRAINT fk_comments_userid FOREIGN KEY (userid) REFERENCES users (id); +ALTER TABLE Comments ADD CONSTRAINT fk_comments_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/Comments_post.sql b/sql/Comments_post.sql index e19e8b8..2c3e7a2 100644 --- a/sql/Comments_post.sql +++ b/sql/Comments_post.sql @@ -6,4 +6,4 @@ CREATE INDEX cmnts_postid_idx ON Comments USING hash (PostId) CREATE INDEX cmnts_creation_date_idx ON Comments USING btree (CreationDate) WITH (FILLFACTOR = 100); CREATE INDEX cmnts_userid_idx ON Comments USING btree (UserId) - WITH (FILLFACTOR = 100); \ No newline at end of file + WITH (FILLFACTOR = 100); diff --git a/sql/PostHistory_fk.sql b/sql/PostHistory_fk.sql new file mode 100644 index 0000000..91379eb --- /dev/null +++ b/sql/PostHistory_fk.sql @@ -0,0 +1,2 @@ +ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_userid FOREIGN KEY (userid) REFERENCES users (id); +ALTER TABLE 
Posthistory ADD CONSTRAINT fk_posthistory_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/PostLinks_fk.sql b/sql/PostLinks_fk.sql new file mode 100644 index 0000000..5c40cb4 --- /dev/null +++ b/sql/PostLinks_fk.sql @@ -0,0 +1,13 @@ +-- impossible to enforce these constraints, set as 'not valid' to disable +-- initial test. +-- +-- These constraints can be enforced by running the following queries: +-- ALTER TABLE postlinks ALTER postid DROP NOT NULL; +-- UPDATE postlinks SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_postid; +-- ALTER TABLE postlinks ALTER relatedpostid DROP NOT NULL; +-- UPDATE postlinks SET relatedpostid=NULL WHERE relatedpostid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_relatedpostid; +-- +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID; +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_relatedpostid FOREIGN KEY (relatedpostid) REFERENCES posts (id) NOT VALID; diff --git a/sql/Posts_fk.sql b/sql/Posts_fk.sql new file mode 100644 index 0000000..65fea37 --- /dev/null +++ b/sql/Posts_fk.sql @@ -0,0 +1,3 @@ +ALTER TABLE Posts ADD CONSTRAINT fk_posts_parentid FOREIGN KEY (parentid) REFERENCES posts (id); +ALTER TABLE Posts ADD CONSTRAINT fk_posts_owneruserid FOREIGN KEY (owneruserid) REFERENCES users (id); +ALTER TABLE Posts ADD CONSTRAINT fk_posts_lasteditoruserid FOREIGN KEY (lasteditoruserid) REFERENCES users (id); diff --git a/sql/Tags_fk.sql b/sql/Tags_fk.sql new file mode 100644 index 0000000..ca4ca40 --- /dev/null +++ b/sql/Tags_fk.sql @@ -0,0 +1,2 @@ +-- dummy query +SELECT 1; diff --git a/sql/Users_fk.sql b/sql/Users_fk.sql new file mode 100644 index 0000000..ca4ca40 --- /dev/null +++ b/sql/Users_fk.sql @@ -0,0 +1,2 @@ +-- dummy query +SELECT 1; diff --git a/sql/Votes_fk.sql b/sql/Votes_fk.sql new file mode 100644 index 
0000000..a52a2a1 --- /dev/null +++ b/sql/Votes_fk.sql @@ -0,0 +1,10 @@ +ALTER TABLE Votes ADD CONSTRAINT fk_votes_userid FOREIGN KEY (userid) REFERENCES users (id); +-- impossible to enforce this constraint, set as 'not valid' to disable +-- initial test. +-- +-- This constraint can be enforced by running the following queries: +-- ALTER TABLE votes ALTER PostId DROP NOT NULL; +-- UPDATE votes SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE votes VALIDATE CONSTRAINT fk_votes_postid; +-- +ALTER TABLE Votes ADD CONSTRAINT fk_votes_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID;