diff --git a/load_into_pg.py b/load_into_pg.py index c65e854..33be75c 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -5,6 +5,7 @@ import psycopg2 as pg import row_processor as Processor import six +import json # Special rules needed for certain tables (esp. for old database dumps) specialRules = { @@ -15,17 +16,17 @@ def _makeDefValues(keys): """Returns a dictionary containing None for all keys.""" return dict(( (k, None) for k in keys )) -def _createMogrificationTemplate(table, keys): +def _createMogrificationTemplate(table, keys, insertJson): """Return the template string for mogrification for the given keys.""" - return ( '(' + - ', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules else specialRules[table, k] - for k in keys - ] - ) + - ')' - ) - -def _createCmdTuple(cursor, keys, templ, attribs): + table_keys = ', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules + else specialRules[table, k] + for k in keys ]) + if insertJson: + return ('(' + table_keys + ', %(jsonfield)s' + ')') + else: + return ('(' + table_keys + ')') + +def _createCmdTuple(cursor, keys, templ, attribs, insertJson): """Use the cursor to mogrify a tuple of data. The passed data in `attribs` is augmented with default data (NULLs) and the order of data in the tuple is the same as in the list of `keys`. The @@ -34,12 +35,20 @@ def _createCmdTuple(cursor, keys, templ, attribs): """ defs = _makeDefValues(keys) defs.update(attribs) + + if insertJson: + dict_attribs = { } + for name, value in attribs.items(): + dict_attribs[name] = value + defs['jsonfield'] = json.dumps(dict_attribs) + + values_to_insert = cursor.mogrify(templ, defs) return cursor.mogrify(templ, defs) -def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): +def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): """Handle the table including the post/pre processing.""" dbFile = mbDbFile if mbDbFile is not None else table + '.xml' - tmpl = _createMogrificationTemplate(table, keys) + tmpl = _createMogrificationTemplate(table, keys, insertJson) start_time = time.time() try: @@ -82,7 +91,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas six.print_('Processing data ...') for rows in Processor.batch(Processor.parse(xml), 500): valuesStr = ',\n'.join( - [ _createCmdTuple(cur, keys, tmpl, row_attribs).decode('utf-8') + [ _createCmdTuple(cur, keys, tmpl, row_attribs, insertJson).decode('utf-8') for row_attribs in rows ] ) @@ -159,6 +168,11 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas , default = False ) +parser.add_argument( '-j', '--insert-json' + , help = 'Insert raw data as JSON.' + , action = 'store_true' + , default = False + ) args = parser.parse_args() table = args.table @@ -279,7 +293,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table)) if len(choice) > 0 and choice[0].lower() == 'y': - handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password) + handleTable(table, keys, args.insert_json, args.dbname, args.file, args.host, args.port, args.username, args.password) else: six.print_("Cancelled.") diff --git a/sql/Badges_pre.sql b/sql/Badges_pre.sql index 98a2b34..65944d9 100644 --- a/sql/Badges_pre.sql +++ b/sql/Badges_pre.sql @@ -3,5 +3,6 @@ CREATE TABLE Badges ( Id int PRIMARY KEY , UserId int not NULL , Name text not NULL , - Date timestamp not NULL + Date timestamp not NULL , + jsonfield jsonb ); diff --git a/sql/Comments_pre.sql b/sql/Comments_pre.sql index 6942df6..43f166c 100644 --- a/sql/Comments_pre.sql +++ b/sql/Comments_pre.sql @@ -5,5 +5,6 @@ CREATE TABLE Comments ( Score int not NULL , Text text , CreationDate timestamp not NULL , - UserId int + UserId int , + jsonfield jsonb ); diff --git a/sql/PostHistory_pre.sql b/sql/PostHistory_pre.sql index 24684d1..361dd3d 100644 --- a/sql/PostHistory_pre.sql +++ b/sql/PostHistory_pre.sql @@ -6,5 +6,6 @@ CREATE TABLE PostHistory ( RevisionGUID text , CreationDate timestamp not NULL , UserId int , - PostText text + PostText text , + jsonfield jsonb ); diff --git a/sql/PostLinks_pre.sql b/sql/PostLinks_pre.sql index aaa258c..3793522 100644 --- a/sql/PostLinks_pre.sql +++ b/sql/PostLinks_pre.sql @@ -4,5 +4,6 @@ CREATE TABLE PostLinks ( CreationDate timestamp not NUll , PostId int not NULL , RelatedPostId int not NULL , - LinkTypeId int not Null + LinkTypeId int not Null , + jsonfield jsonb ); diff --git a/sql/Posts_pre.sql b/sql/Posts_pre.sql index 60f7239..ed4d75e 100644 --- a/sql/Posts_pre.sql +++ b/sql/Posts_pre.sql @@ -19,6 +19,7 @@ CREATE TABLE Posts ( CommentCount int , FavoriteCount int , ClosedDate timestamp , - CommunityOwnedDate timestamp + CommunityOwnedDate timestamp , + jsonfield jsonb ); diff --git a/sql/Tags_pre.sql b/sql/Tags_pre.sql index 26979fe..24dd050 100644 --- a/sql/Tags_pre.sql +++ b/sql/Tags_pre.sql @@ -2,7 +2,8 @@ DROP TABLE IF EXISTS Tags CASCADE; CREATE TABLE Tags ( Id int PRIMARY KEY , TagName text not NULL , - Count int, - ExcerptPostId int, - WikiPostId int + Count int , + ExcerptPostId int , + WikiPostId int , + jsonfield jsonb ); diff --git a/sql/Users_pre.sql b/sql/Users_pre.sql index 4246be3..ad188cf 100644 --- a/sql/Users_pre.sql +++ b/sql/Users_pre.sql @@ -13,6 +13,7 @@ CREATE TABLE Users ( DownVotes int not NULL , ProfileImageUrl text , Age int , - AccountId int -- NULL accountId == deleted account? + AccountId int , -- NULL accountId == deleted account? + jsonfield jsonb ); diff --git a/sql/Votes_pre.sql b/sql/Votes_pre.sql index 2a9b5ff..29aebe0 100644 --- a/sql/Votes_pre.sql +++ b/sql/Votes_pre.sql @@ -5,6 +5,7 @@ CREATE TABLE Votes ( VoteTypeId int not NULL , UserId int , CreationDate timestamp not NULL , - BountyAmount int + BountyAmount int , + jsonfield jsonb );