In C#, 64-bit Windows + .NET 4.5 (or later) + enabling gcAllowVeryLargeObjects in the App.config file allows for objects larger than two gigabytes. That's cool, but unfortunately, the maximum number of elements that C# allows in a character array is still limited to about 2^31 ≈ 2.15 billion chars. Testing confirmed this.
To overcome this, Microsoft recommends in Option B creating the arrays natively (their 'Option C' doesn't even compile). That suits me, as speed is also a concern. Is there some tried and trusted unsafe / native / interop / PInvoke code for .NET out there that can replace and act as an enhanced StringBuilder to get around the 2 billion element limit?
Unsafe/pinvoke code is preferred, but not a deal breaker. Alternatively, is there a .NET (safe) version available?
Ideally, the StringBuilder replacement will start off small (preferably with a user-defined initial size), and then repeatedly double in size each time the capacity has been exceeded. I'm mostly looking for append() functionality here. Saving the string to a file would be useful too, though I'm sure I could program that bit if substring() functionality is also incorporated. If the code uses pinvoke, then obviously some degree of memory management must be taken into account to avoid memory leaks.
I don't want to reinvent the wheel if some simple code already exists, but on the other hand, I don't want to download and incorporate a DLL just for this simple functionality.
I'm also using .NET 3.5 to cater for users who don't have the latest version of Windows.
So I ended up creating my own BigStringBuilder class in the end. It's a list where each list element (or page) is a char array (type List<char[]>).
Providing you're using 64 bit Windows, you can now easily surpass the 2 billion character element limit. I managed to test creating a giant string around 32 gigabytes large (needed to increase virtual memory in the OS first, otherwise I could only get around 7GB on my 8GB RAM PC). I'm sure it handles more than 32GB easily. In theory, it should be able to handle around 1,000,000,000 * 1,000,000,000 chars or one quintillion characters, which should be enough for anyone.
Speed-wise, some quick tests show that it's only around 33% slower than a StringBuilder when appending. I got very similar performance if I went for a 2D jagged char array (char[][]) instead of List<char[]>, but Lists are simpler to work with, so I stuck with that.
Hope somebody else finds it useful! There may be bugs, so use with caution. I tested it fairly well though.
// A simplified version specially for StackOverflow
/// <summary>
/// A growable character buffer that stores its contents in fixed-size pages
/// (a list of <c>char[]</c>), so the total length is tracked as a <c>long</c>
/// rather than being bound by the size of a single array.
/// The page size is always a power of two so that a character position can be
/// split into (page, offset) with a shift and a bit mask.
/// </summary>
public class BigStringBuilder
{
    // The pages. Every page has exactly 'pagesize' elements.
    private List<char[]> c = new List<char[]>();

    // log2 of the page size.
    private readonly int pagedepth;

    // Number of chars per page (2^pagedepth).
    private readonly long pagesize;

    // pagesize - 1; used as a bit mask in place of the slower '%' operator.
    // See https://stackoverflow.com/questions/11040646/faster-modulus-in-c-c
    private readonly long mpagesize;

    // Write cursor. Invariant kept by every member of this class:
    // c[currentPage] exists and 0 <= currentPosInPage < pagesize.
    private int currentPage = 0;
    private int currentPosInPage = 0;

    /// <summary>
    /// Creates an empty builder with a single allocated page.
    /// </summary>
    /// <param name="pagedepth">
    /// log2 of the page size: each page holds 2^pagedepth chars. Must be in the
    /// range 0..30 because the page size and the in-page cursor are handled as <c>int</c>.
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pagedepth"/> is negative or greater than 30.
    /// </exception>
    public BigStringBuilder(int pagedepth = 12)
    {
        if (pagedepth < 0 || pagedepth > 30)
        {
            throw new ArgumentOutOfRangeException("pagedepth", pagedepth, "pagedepth must be between 0 and 30.");
        }

        this.pagedepth = pagedepth;
        pagesize = 1L << pagedepth; // exact integer power of two, no floating point involved
        mpagesize = pagesize - 1;
        c.Add(new char[pagesize]);
    }

    /// <summary>
    /// Gets or sets the char at absolute position <paramref name="n"/>.
    /// No check is made against <see cref="length"/>; any slot inside an
    /// allocated page can be addressed, and positions outside the allocated
    /// pages cause the underlying list to throw.
    /// </summary>
    public char this[long n]
    {
        get { return c[(int)(n >> pagedepth)][n & mpagesize]; }
        set { c[(int)(n >> pagedepth)][n & mpagesize] = value; }
    }

    /// <summary>
    /// Returns one string per page from the first page up to and including the
    /// current page. Each string is built from the whole page array, so the
    /// last one also contains the not-yet-written slots of the current page.
    /// </summary>
    public string[] returnPagesForTestingPurposes()
    {
        string[] pages = new string[currentPage + 1];
        for (int i = 0; i < pages.Length; i++)
        {
            pages[i] = new string(c[i]);
        }
        return pages;
    }

    /// <summary>
    /// Discards all pages and resets the builder to a single empty page.
    /// </summary>
    public void clear()
    {
        c = new List<char[]>();
        c.Add(new char[pagesize]);
        currentPage = 0;
        currentPosInPage = 0;
    }

    /// <summary>
    /// Replaces the current contents with the text of the file at <paramref name="path"/>.
    /// The file is decoded by a <see cref="StreamReader"/> created with its default settings.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    public void fileOpen(string path)
    {
        clear();
        using (StreamReader reader = new StreamReader(path))
        {
            int len;
            while ((len = reader.ReadBlock(c[currentPage], 0, (int)pagesize)) != 0)
            {
                if (len < pagesize)
                {
                    // A short read: this is the final, partially filled page.
                    currentPosInPage = len;
                    break;
                }

                // The page was filled completely. Always move the cursor to a
                // fresh page, even if this turns out to be the end of the file,
                // so that currentPosInPage never equals pagesize (Append relies
                // on that).
                currentPage++;
                if (currentPage == c.Count)
                {
                    c.Add(new char[pagesize]);
                }
            }
        }
    }

    /// <summary>
    /// Writes the current contents to the file at <paramref name="path"/> using
    /// <see cref="File.CreateText"/>.
    /// </summary>
    /// <param name="path">Path of the file to create or overwrite.</param>
    // See: https://stackoverflow.com/questions/373365/how-do-i-write-out-a-text-file-in-c-sharp-with-a-code-page-other-than-utf-8/373372
    public void fileSave(string path)
    {
        using (StreamWriter writer = File.CreateText(path))
        {
            // Full pages are written straight from the page arrays; no
            // temporary string is created per page.
            for (int i = 0; i < currentPage; i++)
            {
                writer.Write(c[i], 0, (int)pagesize);
            }
            writer.Write(c[currentPage], 0, currentPosInPage);
        }
    }

    /// <summary>
    /// Returns the number of chars appended (or loaded) so far.
    /// </summary>
    public long length()
    {
        return (long)currentPage * pagesize + currentPosInPage;
    }

    /// <summary>
    /// Returns the contents as a string, truncated to at most <paramref name="max"/> chars.
    /// </summary>
    /// <param name="max">Upper bound on the number of chars returned.</param>
    /// <remarks>
    /// NOTE(review): the default of 2,000,000,000 is presumably larger than the
    /// longest string the runtime can create, so very large contents may still
    /// fail here - confirm against the target runtime.
    /// </remarks>
    public string ToString(long max = 2000000000)
    {
        return substring(0, Math.Min(length(), max));
    }

    /// <summary>
    /// Returns the chars from position <paramref name="x"/> (inclusive) up to
    /// position <paramref name="y"/> (exclusive). Note that <paramref name="y"/>
    /// is an end position, not a count. Returns an empty string when
    /// <paramref name="x"/> is not less than <paramref name="y"/>.
    /// </summary>
    public string substring(long x, long y)
    {
        StringBuilder sb = new StringBuilder();
        long n = x;
        while (n < y)
        {
            // Copy as much as possible from the page that contains position n.
            int offset = (int)(n & mpagesize);
            int count = (int)Math.Min(pagesize - offset, y - n);
            sb.Append(c[(int)(n >> pagedepth)], offset, count);
            n += count;
        }
        return sb.ToString();
    }

    /// <summary>
    /// Tests whether <paramref name="find"/> occurs at position <paramref name="start"/>.
    /// </summary>
    /// <param name="find">The text to compare against.</param>
    /// <param name="start">Absolute position at which the comparison begins.</param>
    /// <returns>
    /// <c>true</c> if every char of <paramref name="find"/> equals the char at the
    /// corresponding position; <c>false</c> on a mismatch, when
    /// <paramref name="start"/> is negative, or when <paramref name="find"/>
    /// would extend past <see cref="length"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="find"/> is null.</exception>
    public bool match(string find, long start = 0)
    {
        if (find == null)
        {
            throw new ArgumentNullException("find");
        }

        // The whole pattern must lie inside the written part of the buffer;
        // otherwise unused page slots (or missing pages) would be compared.
        if (start < 0 || find.Length > length() - start)
        {
            return false;
        }

        for (int i = 0; i < find.Length; i++)
        {
            if (this[start + i] != find[i])
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Overwrites the chars starting at position <paramref name="pos"/> with the
    /// chars of <paramref name="s"/>. The length of the builder is not changed,
    /// and no check is made against <see cref="length"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public void replace(string s, long pos)
    {
        if (s == null)
        {
            throw new ArgumentNullException("s");
        }

        for (int i = 0; i < s.Length; i++)
        {
            this[pos + i] = s[i];
        }
    }

    /// <summary>
    /// Appends <paramref name="s"/> to the end of the builder, allocating new
    /// pages as needed. A null or empty string appends nothing.
    /// </summary>
    public void Append(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return;
        }

        int copied = 0;
        while (copied < s.Length)
        {
            // Fill the remainder of the current page in one block copy.
            int room = (int)pagesize - currentPosInPage;
            int count = Math.Min(room, s.Length - copied);
            s.CopyTo(copied, c[currentPage], currentPosInPage, count);
            copied += count;
            currentPosInPage += count;

            if (currentPosInPage == pagesize)
            {
                // Page is full: advance the cursor so it always points at a writable slot.
                currentPosInPage = 0;
                currentPage++;
                if (currentPage == c.Count)
                {
                    c.Add(new char[pagesize]);
                }
            }
        }
    }
}