For everybody still looking as of 2025: I just worked out a way to do this reliably.
/// <summary>
/// Snaps <paramref name="value"/> to the nearest multiple of <paramref name="step"/>.
/// Exact halves are rounded towards positive infinity (2.5 -> 3, -2.5 -> -2 for a step of 1).
/// </summary>
/// <param name="value">The value to round.</param>
/// <param name="step">The spacing of the grid to snap to. Must be greater than zero.</param>
/// <returns>A whole number of steps multiplied by <paramref name="step"/>.</returns>
/// <exception cref="System.ArgumentException">
/// Thrown when <paramref name="step"/> is zero, negative or NaN.
/// </exception>
public static float Round(this float value, float step = 1f)
{
    // Negated comparison on purpose: NaN fails "step > 0" and is rejected too.
    if (!(step > 0f))
    {
        throw new System.ArgumentException("Step must be greater than zero.", nameof(step));
    }

    // Work in units of "steps" so the rounding decision is made on the
    // fractional part only, without shifting the input beforehand.
    float scaled = value / step;
    float lower = System.MathF.Floor(scaled);

    // A fraction of one half or more moves up to the next step; everything
    // else stays on the lower one. This keeps ties going upwards for both
    // positive and negative inputs.
    float stepCount = scaled - lower >= 0.5f ? lower + 1f : lower;

    // Building the result from a whole step count means every input that
    // lands in the same bucket produces the same float, which makes the
    // result usable as a grouping key.
    return stepCount * step;
}
/// <summary>
/// Collects the words of one page into text lines.
/// Words are grouped by the vertical midpoint of their bounding box, rounded to
/// <paramref name="tolerance"/>, so words whose midpoints round to the same value
/// end up on the same line.
/// </summary>
/// <param name="document">The document to read from.</param>
/// <param name="page">Page number; values below 1 or above the page count yield an empty result.</param>
/// <param name="wordSeperator">Character placed between the words of a line.</param>
/// <param name="tolerance">Step passed to <c>Round</c> when bucketing the vertical midpoints.</param>
/// <returns>
/// The lines ordered by descending midpoint (top to bottom), each with its words ordered
/// by the left edge of their bounding box. Lines for which <c>NotEmpty()</c> is false are omitted.
/// </returns>
public static string[] GetLines(this PdfDocument document, int page, char wordSeperator = ' ', float tolerance = 0.01f)
{
    // Out-of-range page numbers produce no lines at all.
    if (page <= 0 || document.NumberOfPages < page)
    {
        return [];
    }

    var rows = document.GetPage(page)
        .GetWords()
        .GroupBy(w => ((float)(w.BoundingBox.Top + w.BoundingBox.Bottom) * 0.5f).Round(tolerance))
        .OrderByDescending(row => row.Key); // top to bottom

    // The words themselves are handed to string.Join, so each one contributes
    // whatever its string conversion yields.
    var texts = rows
        .Select(row => string.Join(wordSeperator, row.OrderBy(w => w.BoundingBox.Left)))
        .Where(text => text.NotEmpty());

    return [.. texts];
}
Enjoy. Also take a look at the Cutulu SDK by @narrenschlag on GitHub and leave a star if you liked this. :)
~ Max, der Narr