C# 5, massive as ever
I threw this together; it's a bit of a mess, but it produces some OK results some of the time. It's a mostly-deterministic algorithm, but with some (fixed-seed) randomness added to avoid it producing the same string for similar gaps. It goes to some effort to try to avoid just having columns of spaces either side of the gaps.
It works by tokenizing the input into words and punctuation (punctuation comes from a manually entered list, because I can't be bothered to work out whether Unicode can do this for me), so that it can put spaces before words but not before punctuation, because this is fairly typical. It splits on typical whitespace. In the vein of Markov chains (I think), it counts how often each token follows each other token, but doesn't compute probabilities from this (I figure that because the documents are so tiny, we do better to bias toward things we see a lot where we can). Then we perform a best-first search (a priority queue ordered by cost), filling the space left by the hashes and the 'partial' words either side; for each token appended to the generated string the cost is -fabness(last, cur) * len(cur_with_space), where fabness returns the number of times cur has followed last. Naturally, we try to minimise the total cost. Because we can't always fill the gap with words and punctuation found in the document, it also considers a number of 'special' tokens from certain states, including the partial strings on either side, which we bias against with arbitrarily increased costs.
If the search fails to find a solution, we naively fall back to an adverb of exactly the right length from a small hard-coded list, or just insert spaces to fill the space.
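To make the cost function concrete, here is a tiny self-contained sketch of the follow-count table and the -fabness * length cost on a toy sentence (the corpus, candidate words and class name are invented for illustration; the real code builds the table over the whole input document):
using System;
using System.Collections.Generic;
class FollowCountDemo
{
static void Main()
{
// toy corpus standing in for the document being patched
string[] tokens = "the cat sat on the mat and the cat slept".Split(' ');
// count how often each token follows each other token
var follows = new Dictionary<string, Dictionary<string, int>>();
for (int i = 1; i < tokens.Length; i++)
{
Dictionary<string, int> table;
if (!follows.TryGetValue(tokens[i - 1], out table))
follows.Add(tokens[i - 1], table = new Dictionary<string, int>());
int count;
table.TryGetValue(tokens[i], out count);
table[tokens[i]] = count + 1;
}
// fabness(prev, next): the raw follow count, deliberately not normalised into a probability
Func<string, string, double> fabness = (prev, next) =>
{
Dictionary<string, int> t;
int n;
return follows.TryGetValue(prev, out t) && t.TryGetValue(next, out n) ? n : 0;
};
// cost of appending each candidate after "the" (with its leading space): -fabness(last, cur) * len(cur_with_space)
// lower is better, so frequent followers like "cat" beat "dog", which never follows "the" in the corpus
foreach (string candidate in new[] { "cat", "mat", "dog" })
Console.WriteLine("the -> " + candidate + ": cost " + (-fabness("the", candidate) * (candidate.Length + 1)));
}
}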
Results
All 6 can be found here: https://gist.github.com/anonymous/5277db726d3f9bdd950b173b19fec82a
The Euclid test-case did not go very well...
Patch the Image
In a popular image editing software there is a feature, that patches (The term
used in image processing is inpainting as @minxomat pointed out.) a selected
area of an image, that patches information outside of that patch. And it does a
quite good job, co the patch a is just a program. As a human, you can sometimes
see that something In a short it if you squeeze your eyes or just take a short
glance, the patch seems to fill in the gap quite well.
Jabberwocky
'Twas brillig, and the slithy toves
Did gyre and the in in the wabe;
All mimsy the mome borogoves,
And the mome raths outgrabe.
Badger
Badger, badger, badger, badger, badger,
badger, badger, badger, badger, badger
Mushroom, mushroom, a-
Badger, badger, badger, badger, badger,
badger, badger, badger, badger, badger
Mushroom, badger, badger
Badger, badger, badger, badger, badger,
badger, badger, badger, badger, badger
Mush-mushroom, a
Badger, badger, badger, badger, badger,
badger, badger, badger, badger, badger
Argh! Snake, a snake!
Snaaake! A snaaaake, oooh its a snake!
I'm pleased with how this one turned out... it's fortunate that "badger, badger," fits, or this one would not have done so well.
Code
Run it with
csc ParaPatch.cs
ParaPatch.exe infile outfile
There is quite a lot of it. The only remotely interesting bit is the Fill method. I include the heap implementation, because .NET doesn't have one (WHY MS WHY?!).
using System;
using System.Collections.Generic;
using System.Linq;
namespace ParaPatch
{
class Program
{
private static string[] Filler = new string[] { "may", "will", "maybe", "rather", "perhaps", "reliably", "nineword?", "definitely", "elevenword?", "inexplicably" }; // adverbs
private static char[] Breaking = new char[] { ' ', '\n', '\r', '\t' };
private static char[] Punctuation = new char[] { ',', '.', '{', '}', '(', ')', '/', '?', ':', ';', '\'', '\\', '"', ',', '!', '-', '+', '[', ']', '£', '$', '%', '^', '—' };
// splits the raw character stream into whitespace-separated words plus single-character punctuation tokens
private static IEnumerable<string> TokenizeStream(System.IO.StreamReader reader)
{
System.Text.StringBuilder sb = new System.Text.StringBuilder();
HashSet<char> breaking = new HashSet<char>(Breaking);
HashSet<char> punctuation = new HashSet<char>(Punctuation);
while (!reader.EndOfStream)
{
int ci = reader.Read();
if (ci == -1) // sanity
break;
char c = (char)ci;
if (breaking.Contains(c))
{
if (sb.Length > 0)
yield return sb.ToString();
sb.Clear();
}
else if (punctuation.Contains(c))
{
if (sb.Length > 0)
yield return sb.ToString();
yield return ""+c;
sb.Clear();
}
else
{
sb.Append(c);
}
}
if (sb.Length > 0)
yield return sb.ToString();
}
private enum DocTokenTypes
{
Known,
LeftPartial,
RightPartial,
Unknown,
}
private class DocToken
{
public DocTokenTypes TokenType { get; private set; }
public string StringPart { get; private set; }
public int Length { get; private set; }
public DocToken(DocTokenTypes tokenType, string stringPart, int length)
{
TokenType = tokenType;
StringPart = stringPart;
Length = length;
}
}
// turns raw tokens into document tokens, splitting any token containing '#' into LeftPartial, Unknown and RightPartial pieces
private static IEnumerable<DocToken> DocumentTokens(IEnumerable<string> tokens)
{
foreach (string token in tokens)
{
if (token.Contains("#"))
{
int l = token.IndexOf("#");
int r = token.LastIndexOf("#");
if (l > 0)
yield return new DocToken(DocTokenTypes.LeftPartial, token.Substring(0, l), l);
yield return new DocToken(DocTokenTypes.Unknown, null, r - l + 1);
if (r < token.Length - 1)
yield return new DocToken(DocTokenTypes.RightPartial, token.Substring(r + 1), token.Length - r - 1);
}
else
yield return new DocToken(DocTokenTypes.Known, token, token.Length);
}
}
// a node in the best-first search: what has been generated so far, how much of the gap remains, and the accumulated cost
private class State : IComparable<State>
{
// missing readonly params already... maybe C#6 isn't so bad
public int Remaining { get; private set; }
public int Position { get; private set; }
public State Prev { get; private set; }
public string Token { get; private set; }
public double H { get; private set; }
public double Fabness { get; private set; }
public string FullFilling { get; private set; }
public State(int remaining, int position, Program.State prev, double fabness, double h, string token, string toAdd)
{
Remaining = remaining;
Position = position;
Prev = prev;
H = h;
Fabness = fabness;
Token = token;
FullFilling = prev != null ? prev.FullFilling + toAdd : toAdd;
}
public int CompareTo(State other)
{
return H.CompareTo(other.H);
}
}
public static void Main(string[] args)
{
if (args.Length < 2)
args = new string[] { "test.txt", "testout.txt" };
List<DocToken> document;
using (System.IO.StreamReader reader = new System.IO.StreamReader(args[0], System.Text.Encoding.UTF8))
{
document = DocumentTokens(TokenizeStream(reader)).ToList();
}
foreach (DocToken cur in document)
{
Console.WriteLine(cur.StringPart + " " + cur.TokenType);
}
// these are small docs, don't bother with more than 1 ply
Dictionary<string, Dictionary<string, int>> FollowCounts = new Dictionary<string, Dictionary<string, int>>();
Dictionary<string, Dictionary<string, int>> PreceedCounts = new Dictionary<string, Dictionary<string, int>>(); // mirror (might be useful)
HashSet<string> knowns = new HashSet<string>(); // useful to have lying around
// build counts
DocToken last = null;
foreach (DocToken cur in document)
{
if (cur.TokenType == DocTokenTypes.Known)
{
knowns.Add(cur.StringPart);
}
if (last != null && last.TokenType == DocTokenTypes.Known && cur.TokenType == DocTokenTypes.Known)
{
{
Dictionary<string, int> ltable;
if (!FollowCounts.TryGetValue(last.StringPart, out ltable))
{
FollowCounts.Add(last.StringPart, ltable = new Dictionary<string, int>());
}
int count;
if (!ltable.TryGetValue(cur.StringPart, out count))
{
count = 0;
}
ltable[cur.StringPart] = count + 1;
}
{
Dictionary<string, int> ctable;
if (!PreceedCounts.TryGetValue(cur.StringPart, out ctable))
{
PreceedCounts.Add(cur.StringPart, ctable = new Dictionary<string, int>());
}
int count;
if (!ctable.TryGetValue(last.StringPart, out count))
{
count = 0;
}
ctable[last.StringPart] = count + 1;
}
}
last = cur;
}
// build probability grid (none of this efficient table filling dynamic programming nonsense, A* all the way!)
// hmm... can't be bothered
Dictionary<string, Dictionary<string, double>> fabTable = new Dictionary<string, Dictionary<string, double>>();
foreach (var k in FollowCounts)
{
Dictionary<string, double> t = new Dictionary<string, double>();
// very naive
foreach (var k2 in k.Value)
{
t.Add(k2.Key, (double)k2.Value);
}
fabTable.Add(k.Key, t);
}
string[] knarr = knowns.ToArray();
Random rnd = new Random("ParaPatch".GetHashCode());
List<string> fillings = new List<string>();
for (int i = 0; i < document.Count; i++)
{
if (document[i].TokenType == DocTokenTypes.Unknown)
{
// shuffle knarr
for (int j = 0; j < knarr.Length; j++)
{
string t = knarr[j];
int o = rnd.Next(knarr.Length);
knarr[j] = knarr[o];
knarr[o] = t;
}
fillings.Add(Fill(document, fabTable, knarr, i));
Console.WriteLine(fillings.Last());
}
}
string filling = string.Join("", fillings);
int fi = 0;
using (System.IO.StreamWriter writer = new System.IO.StreamWriter(args[1]))
using (System.IO.StreamReader reader = new System.IO.StreamReader(args[0]))
{
while (!reader.EndOfStream)
{
int ci = reader.Read();
if (ci == -1)
break;
char c = (char)ci;
c = c == '#' ? filling[fi++] : c;
writer.Write(c);
Console.Write(c);
}
}
// using (System.IO.StreamWriter writer = new System.IO.StreamWriter(args[1], false, System.Text.Encoding.UTF8))
// using (System.IO.StreamReader reader = new System.IO.StreamReader(args[0]))
// {
// foreach (char cc in reader.ReadToEnd())
// {
// char c = cc;
// c = c == '#' ? filling[fi++] : c;
//
// writer.Write(c);
// Console.Write(c);
// }
// }
if (args[0] == "test.txt")
Console.ReadKey(true);
}
// best-first search over candidate tokens to fill the gap at unknownIndex, including any partial words either side
private static string Fill(List<DocToken> document, Dictionary<string, Dictionary<string, double>> fabTable, string[] knowns, int unknownIndex)
{
HashSet<char> breaking = new HashSet<char>(Breaking);
HashSet<char> punctuation = new HashSet<char>(Punctuation);
Heap<State> due = new Heap<Program.State>(knowns.Length);
Func<string, string, double> fabness = (prev, next) =>
{
Dictionary<string, double> table;
if (!fabTable.TryGetValue(prev, out table))
return 0; // not fab
double fab;
if (!table.TryGetValue(next, out fab))
return 0; // not fab
return fab; // yes fab
};
DocToken mostLeft = unknownIndex >= 2 ? document[unknownIndex - 2] : null;
DocToken left = unknownIndex >= 1 ? document[unknownIndex - 1] : null;
DocToken unknown = document[unknownIndex];
DocToken right = unknownIndex < document.Count - 1 ? document[unknownIndex + 1] : null;
DocToken mostRight = unknownIndex < document.Count - 2 ? document[unknownIndex + 2] : null;
// sum of empty space and partials' lengths
int spaceSize = document[unknownIndex].Length
+ (left != null && left.TokenType == DocTokenTypes.LeftPartial ? left.Length : 0)
+ (right != null && right.TokenType == DocTokenTypes.RightPartial ? right.Length : 0);
int l = left != null && left.TokenType == DocTokenTypes.LeftPartial ? left.Length : 0;
int r = l + unknown.Length;
string defaultPrev =
left != null && left.TokenType == DocTokenTypes.Known ? left.StringPart :
mostLeft != null && mostLeft.TokenType == DocTokenTypes.Known ? mostLeft.StringPart :
"";
string defaultLast =
right != null && right.TokenType == DocTokenTypes.Known ? right.StringPart :
mostRight != null && mostRight.TokenType == DocTokenTypes.Known ? mostRight.StringPart :
"";
Func<string, string> topAndTail = str =>
{
return str.Substring(l, r - l);
};
Func<State, string, double, bool> tryMove = (State prev, string token, double specialFabness) =>
{
bool isPunctuation = token.Length == 1 && punctuation.Contains(token[0]);
string addStr = isPunctuation || prev == null ? token : " " + token;
int addLen = addStr.Length;
int newRemaining = prev != null ? prev.Remaining - addLen : spaceSize - addLen;
int oldPosition = prev != null ? prev.Position : 0;
int newPosition = oldPosition + addLen;
// check length
if (newRemaining < 0)
return false;
// check start
if (oldPosition < l) // implies left is LeftPartial
{
int s = oldPosition;
int e = newPosition > l ? l : newPosition;
int len = e - s;
if (addStr.Substring(0, len) != left.StringPart.Substring(s, len))
return false; // doesn't match LeftPartial
}
// check end
if (newPosition > r) // implies right is RightPartial
{
int s = oldPosition > r ? oldPosition : r;
int e = newPosition;
int len = e - s;
if (addStr.Substring(s - oldPosition, len) != right.StringPart.Substring(s - r, len))
return false; // doesn't match RightPartial
}
if (newRemaining == 0)
{
// could try to do something here (need to change H)
}
string prevToken = prev != null ? prev.Token : defaultPrev;
bool isLastPunctuation = prevToken.Length == 1 && punctuation.Contains(prevToken[0]);
if (isLastPunctuation && isPunctuation) // I hate this check, it's too aggressive to be realistic
specialFabness -= 50;
double fab = fabness(prevToken, token);
if (fab < 1 && (token == prevToken))
fab = -1; // bias against unrecognised repeats
double newFabness = (prev != null ? prev.Fabness : 0.0)
- specialFabness // ... whatever this is
- fab * addLen; // how probabilistic
double h = newFabness; // no h for now
State newState = new Program.State(newRemaining, newPosition, prev, newFabness, h, token, addStr);
// Console.WriteLine((prev != null ? prev.Fabness : 0) + "\t" + specialFabness);
// Console.WriteLine(newFabness + "\t" + h + "\t" + due.Count + "\t" + fab + "*" + addLen + "\t" + newState.FullFilling);
due.Add(newState);
return true;
};
// just try everything
foreach (string t in knowns)
tryMove(null, t, 0);
if (left != null && left.TokenType == DocTokenTypes.LeftPartial)
tryMove(null, left.StringPart, -1);
while (!due.Empty)
{
State next = due.RemoveMin();
if (next.Remaining == 0)
{
// we have a winner!!
return topAndTail(next.FullFilling);
}
// just try everything
foreach (string t in knowns)
tryMove(next, t, 0);
if (right != null && right.TokenType == DocTokenTypes.RightPartial)
tryMove(next, right.StringPart, -5); // big bias
}
// make this a tad less stupid, non?
return Filler.FirstOrDefault(f => f.Length == unknown.Length) ?? new String(' ', unknown.Length); // oh dear...
}
}
//
// Utilities
//
// minimal array-backed binary min-heap, used as the priority queue for the search
public class Heap<T> : System.Collections.IEnumerable where T : IComparable<T>
{
// arr is treated as offset by 1, all idxes stored need to be -1'd to get index in arr
private T[] arr;
private int end = 0;
private void s(int idx, T val)
{
arr[idx - 1] = val;
}
private T g(int idx)
{
return arr[idx - 1];
}
public Heap(int isize)
{
if (isize < 1)
throw new ArgumentException("Cannot be less than 1", "isize");
arr = new T[isize];
}
private int up(int idx)
{
return idx / 2;
}
private int downLeft(int idx)
{
return idx * 2;
}
private int downRight(int idx)
{
return idx * 2 + 1;
}
private void swap(int a, int b)
{
T t = g(a);
s(a, g(b));
s(b, t);
}
private void moveUp(int idx, T t)
{
again:
if (idx == 1)
{
s(1, t);
return; // at end
}
int nextUp = up(idx);
T n = g(nextUp);
if (n.CompareTo(t) > 0)
{
s(idx, n);
idx = nextUp;
goto again;
}
else
{
s(idx, t);
}
}
private void moveDown(int idx, T t)
{
again:
int nextLeft = downLeft(idx);
int nextRight = downRight(idx);
if (nextLeft > end)
{
s(idx, t);
return; // at end
}
else if (nextLeft == end)
{ // only need to check left
T l = g(nextLeft);
if (l.CompareTo(t) < 0)
{
s(idx, l);
idx = nextLeft;
goto again;
}
else
{
s(idx, t);
}
}
else
{ // check both
T l = g(nextLeft);
T r = g(nextRight);
if (l.CompareTo(r) < 0)
{ // left smaller (favour going right if we can)
if (l.CompareTo(t) < 0)
{
s(idx, l);
idx = nextLeft;
goto again;
}
else
{
s(idx, t);
}
}
else
{ // right smaller or same
if (r.CompareTo(t) < 0)
{
s(idx, r);
idx = nextRight;
goto again;
}
else
{
s(idx, t);
}
}
}
}
public void Clear()
{
end = 0;
}
public void Trim()
{
if (end == 0)
arr = new T[1]; // don't /ever/ make arr len 0
else
{
T[] narr = new T[end];
for (int i = 0; i < end; i++)
narr[i] = arr[i];
arr = narr;
}
}
private void doubleSize()
{
T[] narr = new T[arr.Length * 2];
for (int i = 0; i < end; i++)
narr[i] = arr[i];
arr = narr;
}
public void Add(T item)
{
if (end == arr.Length)
{
// resize
doubleSize();
}
end++;
moveUp(end, item);
}
public T RemoveMin()
{
if (end < 1)
throw new Exception("No items, mate.");
T min = g(1);
end--;
if (end > 0)
moveDown(1, g(end + 1));
return min;
}
public bool Empty
{
get
{
return end == 0;
}
}
public int Count
{
get
{
return end;
}
}
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
public IEnumerator<T> GetEnumerator()
{
// only enumerate the live portion; the array's non-generic enumerator can't be cast to IEnumerator<T>
return arr.Take(end).GetEnumerator();
}
}
}
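For what it's worth, the heap works fine on its own. A minimal usage sketch (the values are arbitrary; compile it next to the main file with something like csc /main:HeapDemo ParaPatch.cs HeapDemo.cs):
using System;
class HeapDemo
{
static void Main()
{
// start small; Heap<T> doubles its backing array as needed
var heap = new ParaPatch.Heap<int>(4);
foreach (int v in new[] { 5, 1, 4, 2, 3 })
heap.Add(v);
// RemoveMin always yields the smallest remaining item, so this prints 1 2 3 4 5
while (!heap.Empty)
Console.Write(heap.RemoveMin() + " ");
Console.WriteLine();
}
}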