I'm looking for an optimal method for putting unique constraints on strings.
My use case is linking all email messages together by their "Message-ID" and "In-Reply-To" fields on the SMTP feed.
However, because the number of messages can grow into the millions, and we have no plans to delete anything, I need a way to index them very quickly. The problem is that I'm under the impression that strings are inherently slower to index that numbers (If I'm wrong about that, please explain).
My solution so far is to convert the message ID to a sha256 hash, then convert to into an 8 x 32 bit chunk 256 bit number like so:
// Its not actually written in C
struct message_id {
int32_t id;
char[255] originalMessageId;
int32_t p01;
int32_t p02;
int32_t p03;
int32_t p04;
int32_t p05;
int32_t p06;
int32_t p07;
int32_t p08;
}
And then put a unique constraint on all the nodes.
Now before anyone says anything about the quality of uniqueness of the message ID, I am aware, but this system isn't designed to have high integrity, just high performance.
So my question is this:
Will this suffice, or is there a trick to indexing strings in MySql that I've missed?
EDIT: Adding Model Design.
MessageIdentity.php
/**
* MessageIdentity
*
* @ORM\Table(name="inbox_message_identity",
* uniqueConstraints={
* @ORM\UniqueConstraint(columns={
* "p01", "p02", "p03", "p04",
* "p05", "p06", "p07", "p08"
* })
* })
* @ORM\Entity(repositoryClass="AppBundle\Entity\Inbox\MessageIdentityRepository")
*/
class MessageIdentity
{
/**
* @var integer
*
* @ORM\Column(name="id", type="integer")
* @ORM\Id
* @ORM\GeneratedValue(strategy="AUTO")
*/
private $id;
/**
* @var string
*
* @ORM\Column(name="originalMessageId", type="string", length=255)
*/
private $originalMessageId;
/**
* @var integer
*
* @ORM\Column(name="p01", type="integer")
*/
private $p01;
/**
* @var integer
*
* @ORM\Column(name="p02", type="integer")
*/
private $p02;
/**
* @var integer
*
* @ORM\Column(name="p03", type="integer")
*/
private $p03;
/**
* @var integer
*
* @ORM\Column(name="p04", type="integer")
*/
private $p04;
/**
* @var integer
*
* @ORM\Column(name="p05", type="integer")
*/
private $p05;
/**
* @var integer
*
* @ORM\Column(name="p06", type="integer")
*/
private $p06;
/**
* @var integer
*
* @ORM\Column(name="p07", type="integer")
*/
private $p07;
/**
* @var integer
*
* @ORM\Column(name="p08", type="integer")
*/
private $p08;
/**
* @param $string
*/
public function __construct($string)
{
parent::__construct();
$bits = self::createBits($this->originalMessageId = $string);
$this->p01 = $bits[0];
$this->p02 = $bits[1];
$this->p03 = $bits[2];
$this->p04 = $bits[3];
$this->p05 = $bits[4];
$this->p06 = $bits[5];
$this->p07 = $bits[6];
$this->p08 = $bits[7];
}
public static function createBits($string)
{
$hash = hash('sha256', $string);
$bits = array();
// Bits are packed in pairs of 16 bit chunks before unpacking as signed 32 bit chunks
// in order to guarrentee there is no overflow when converting the unsigned hex number into a
// PHP integer on 32 bit machines.
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 0, 4))) . pack('s', hexdec(substr($hash, 4, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 8, 4))) . pack('s', hexdec(substr($hash, 12, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 16, 4))) . pack('s', hexdec(substr($hash, 20, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 24, 4))) . pack('s', hexdec(substr($hash, 28, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 32, 4))) . pack('s', hexdec(substr($hash, 36, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 40, 4))) . pack('s', hexdec(substr($hash, 44, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 48, 4))) . pack('s', hexdec(substr($hash, 52, 4)))));
$bits[] = self::pluck(unpack('l', pack('s', hexdec(substr($hash, 56, 4))) . pack('s', hexdec(substr($hash, 60, 4)))));
return $bits;
}
protected static function pluck($array)
{
return $array[1];
}
}
MessageIdentityRepository.php
class MessageIdentityRepository extends \Doctrine\ORM\EntityRepository
{
public function getExisting($string)
{
$bits = MessageIdentity::createBits($string);
$qb = $this->createQueryBuilder('i');
$qb
->where($qb->expr()->andX(
$qb->expr()->eq('i.p01', $qb->expr()->literal($bits[0])),
$qb->expr()->eq('i.p02', $qb->expr()->literal($bits[1])),
$qb->expr()->eq('i.p03', $qb->expr()->literal($bits[2])),
$qb->expr()->eq('i.p04', $qb->expr()->literal($bits[3])),
$qb->expr()->eq('i.p05', $qb->expr()->literal($bits[4])),
$qb->expr()->eq('i.p06', $qb->expr()->literal($bits[5])),
$qb->expr()->eq('i.p07', $qb->expr()->literal($bits[6])),
$qb->expr()->eq('i.p08', $qb->expr()->literal($bits[7]))
))
->setMaxResults(1)
;
return $qb->getQuery()->getOneOrNullResult();
}
}
MessageRepository.php
class MessageRepository extends \Doctrine\ORM\EntityRepository
{
public function getLastWithMessageID(MessageIdentity $messageIdentity)
{
$qb = $this->createQueryBuilder('m');
$qb
->where('m.messageIdentity = :identity')
->setParameter(':identity', $messageIdentity)
->orderBy('m.date', 'DESC')
->setMaxResults(1)
;
return $qb->getQuery()->getOneOrNullResult();
}
}
This is a model built using Doctrine2. The message itself holds a foreign key to the MessageIdentity table.
The MessageIdentity is searched for by reconstructing the bit set, and searching on all of the columns which should perfectly utilize the unique constraint placed on the table.
The message is searched for based on the mapped identity, order by descending date, and only a single row is fetched.
messagesor something? Showing the structure could help answering your question.UNHEX()MySQL method so you can cut down on storage requirement (no need for varchars).