Code Optimization

Interesting things in software development and code optimization

C#, .NET, x32/x64 Assembler and cross-platform code

Hello my dear friends.


Today we will look at two things: something as popular as cross-platform code and something as powerful as assembler. Of course, we will use C#/.NET for all of this, as usual :)

To make a cross-platform code we have to be aware of two main things:

- method call conventions 

- system API

We need different, platform-dependent system APIs to allocate and free executable memory, and we need universal code to avoid calling-convention problems.

To call system-dependent APIs we need to know which OS we are running under; here is how we can detect it:

 

/// <summary>Operating-system families this sample distinguishes between.</summary>
private enum Platform
{
    Windows,
    Linux,
    Mac
}

/// <summary>
/// Detects the operating-system family the current process is running on.
/// </summary>
/// <returns>
/// <see cref="Platform.Mac"/> when the runtime reports MacOSX, or reports Unix
/// and the Mac-specific root folders exist; <see cref="Platform.Linux"/> for any
/// other Unix; <see cref="Platform.Windows"/> for everything else.
/// </returns>
private static Platform RunningPlatform()
{
    switch (Environment.OSVersion.Platform)
    {
        case PlatformID.Unix:
            // Well, there are chances MacOSX is reported as Unix instead of MacOSX.
            // Instead of a platform check, do a feature check (Mac-specific root folders).
            // The comments sit on their own lines so they cannot swallow the condition,
            // and && stops probing the file system at the first missing folder.
            if (Directory.Exists("/Applications")
                && Directory.Exists("/System")
                && Directory.Exists("/Users")
                && Directory.Exists("/Volumes"))
            {
                return Platform.Mac;
            }

            return Platform.Linux;

        case PlatformID.MacOSX:
            return Platform.Mac;

        default:
            return Platform.Windows;
    }
}

Now we are going to declare P/Invokes for Linux and Windows and implement the logic to allocate and free memory according to the OS we are running under (the Linux P/Invokes were taken from the Mono source code):

#region Windows

/// <summary>Flag values for the <c>flAllocationType</c> argument of the VirtualAlloc import below.</summary>
[Flags]
private enum AllocationTypes : uint
{
    Commit = 0x1000,
    Reserve = 0x2000,
    Reset = 0x80000,
    LargePages = 0x20000000,
    Physical = 0x400000,
    TopDown = 0x100000,
    WriteWatch = 0x200000
}

/// <summary>Flag values for the <c>flProtect</c> argument of the VirtualAlloc import below.</summary>
[Flags]
private enum MemoryProtections : uint
{
    Execute = 0x10,
    ExecuteRead = 0x20,
    ExecuteReadWrite = 0x40,
    ExecuteWriteCopy = 0x80,
    NoAccess = 0x01,
    ReadOnly = 0x02,
    ReadWrite = 0x04,
    WriteCopy = 0x08,
    GuartModifierflag = 0x100,
    NoCacheModifierflag = 0x200,
    WriteCombineModifierflag = 0x400
}

/// <summary>Flag values for the <c>flFreeType</c> argument of the VirtualFree import below.</summary>
[Flags]
private enum FreeTypes : uint
{
    Decommit = 0x4000,
    Release = 0x8000
}

/// <summary>kernel32 VirtualAlloc import; the last Win32 error is captured for the caller.</summary>
[DllImport("kernel32.dll", SetLastError = true)]
private static extern IntPtr VirtualAlloc(
    IntPtr lpAddress,
    UIntPtr dwSize,
    AllocationTypes flAllocationType,
    MemoryProtections flProtect);

/// <summary>kernel32 VirtualFree import; the native BOOL result is marshalled to <c>bool</c>.</summary>
[DllImport("kernel32")]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool VirtualFree(
    IntPtr lpAddress,
    uint dwSize,
    FreeTypes flFreeType);

#endregion

#region Unix

/// <summary>
/// Marker attribute carrying an optional native type name and an optional
/// "suppress flags" string for the annotated class, delegate, enum, field or struct.
/// </summary>
[AttributeUsage(
    AttributeTargets.Class | AttributeTargets.Delegate | AttributeTargets.Enum |
    AttributeTargets.Field | AttributeTargets.Struct)]
private class MapAttribute : Attribute
{
    /// <summary>Creates the attribute without a native type name.</summary>
    public MapAttribute()
    {
    }

    /// <summary>Creates the attribute for the given native type name.</summary>
    /// <param name="nativeType">Value exposed through <see cref="NativeType"/>.</param>
    public MapAttribute(string nativeType)
    {
        NativeType = nativeType;
    }

    /// <summary>Native type name given to the constructor; null when none was supplied.</summary>
    public string NativeType { get; private set; }

    /// <summary>Free-form value settable as a named attribute argument.</summary>
    public string SuppressFlags { get; set; }
}

// Library names used by the DllImport declarations below.
private const string MPH = "MonoPosixHelper";
private const string LIBC = "msvcrt";

/// <summary>Page-protection flags for <c>mmap</c> and <c>mprotect</c>.</summary>
[Map]
[Flags]
private enum MmapProts : int
{
    // Page can be read.
    PROT_READ = 0x1,
    // Page can be written.
    PROT_WRITE = 0x2,
    // Page can be executed.
    PROT_EXEC = 0x4,
    // Page can not be accessed.
    PROT_NONE = 0x0,
    // Extend change to start of growsdown vma (mprotect only).
    PROT_GROWSDOWN = 0x01000000,
    // Extend change to start of growsup vma (mprotect only).
    PROT_GROWSUP = 0x02000000,
}

/// <summary>Mapping flags for <c>mmap</c>.</summary>
[Map]
[Flags]
private enum MmapFlags : int
{
    // Share changes.
    MAP_SHARED = 0x01,
    // Changes are private.
    MAP_PRIVATE = 0x02,
    // Mask for type of mapping.
    MAP_TYPE = 0x0f,
    // Interpret addr exactly.
    MAP_FIXED = 0x10,
    MAP_FILE = 0,
    // Don't use a file.
    MAP_ANONYMOUS = 0x20,
    MAP_ANON = MAP_ANONYMOUS,

    // These are Linux-specific.

    // Stack-like segment.
    MAP_GROWSDOWN = 0x00100,
    // ETXTBSY
    MAP_DENYWRITE = 0x00800,
    // Mark it as an executable.
    MAP_EXECUTABLE = 0x01000,
    // Lock the mapping.
    MAP_LOCKED = 0x02000,
    // Don't check for reservations.
    MAP_NORESERVE = 0x04000,
    // Populate (prefault) pagetables.
    MAP_POPULATE = 0x08000,
    // Do not block on IO.
    MAP_NONBLOCK = 0x10000,
}

/// <summary>Binds to <c>Mono_Posix_Syscall_mmap</c> in MonoPosixHelper.</summary>
[DllImport(MPH, SetLastError = true, EntryPoint = "Mono_Posix_Syscall_mmap")]
private static extern IntPtr mmap(
    IntPtr start,
    ulong length,
    MmapProts prot,
    MmapFlags flags,
    int fd,
    long offset);

/// <summary>Binds to <c>Mono_Posix_Syscall_munmap</c> in MonoPosixHelper.</summary>
[DllImport(MPH, SetLastError = true, EntryPoint = "Mono_Posix_Syscall_munmap")]
public static extern int munmap(IntPtr start, ulong length);

/// <summary>Binds to <c>Mono_Posix_Syscall_mprotect</c> in MonoPosixHelper.</summary>
[DllImport(MPH, SetLastError = true, EntryPoint = "Mono_Posix_Syscall_mprotect")]
private static extern int mprotect(IntPtr start, ulong len, MmapProts prot);

/// <summary>Binds to <c>Mono_Posix_Stdlib_malloc</c> in MonoPosixHelper (cdecl).</summary>
[DllImport(MPH, CallingConvention = CallingConvention.Cdecl, SetLastError = true, EntryPoint = "Mono_Posix_Stdlib_malloc")]
private static extern IntPtr malloc(ulong size);

/// <summary>Binds to <c>free</c> in the library named by <c>LIBC</c> (cdecl).</summary>
[DllImport(LIBC, CallingConvention = CallingConvention.Cdecl)]
public static extern void free(IntPtr ptr);

#endregion


/// <summary>
/// Signature of the generated native routine: no arguments, no return value,
/// invoked with the cdecl calling convention.
/// </summary>
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
public unsafe delegate void asmFunc();

/// <summary>
/// Allocates a block of readable, writable and executable memory using the
/// allocator of the current operating system.
/// </summary>
/// <param name="size">Number of bytes requested.</param>
/// <returns>
/// Address of the allocated block. The result of the underlying call is
/// returned unchanged, so on failure this is <see cref="IntPtr.Zero"/> on
/// Windows and (IntPtr)(-1), i.e. MAP_FAILED, from mmap; callers must check.
/// </returns>
public static IntPtr VirtualAlloc(uint size)
{
    if (RunningPlatform() == Platform.Windows)
    {
        return VirtualAlloc(
            IntPtr.Zero,
            new UIntPtr(size),
            AllocationTypes.Commit | AllocationTypes.Reserve,
            MemoryProtections.ExecuteReadWrite);
    }

    Console.WriteLine("Linux memory allocation...");

    // Map exactly the requested length (it used to be hard-coded to 4096, so
    // larger requests were silently under-allocated). An anonymous mapping has
    // no backing file; -1 is the portable descriptor value for that case.
    IntPtr ptr = mmap(
        IntPtr.Zero,
        size,
        MmapProts.PROT_EXEC | MmapProts.PROT_READ | MmapProts.PROT_WRITE,
        MmapFlags.MAP_ANONYMOUS | MmapFlags.MAP_PRIVATE,
        -1,
        0);

    Console.WriteLine("memory ptr: " + ptr.ToInt64());
    return ptr;
}


/// <summary>
/// Releases a block previously obtained from <c>VirtualAlloc(uint)</c>.
/// </summary>
/// <param name="ptr">Base address returned by the allocation call.</param>
/// <param name="size">
/// Size that was requested at allocation time. Used as the mapping length on
/// non-Windows systems; ignored on Windows.
/// </param>
public static void VirtualFree(IntPtr ptr, uint size)
{
    if (RunningPlatform() == Platform.Windows)
    {
        // With FreeTypes.Release (MEM_RELEASE) the Win32 API requires dwSize
        // to be 0: the whole region reserved by the original allocation is
        // released. Passing the real size makes the call fail and leaks memory.
        VirtualFree(ptr, 0, FreeTypes.Release);
        return;
    }

    Console.WriteLine("Free memory ptr: " + ptr.ToInt64());

    int r = munmap(ptr, size);

    Console.WriteLine("memory free status: " + r);
}


OK, we have methods to allocate and free memory. Now we need some predefined assembly code and a template that avoids any calling-convention problems. To achieve that, we declare our methods and delegates as parameterless and pass the parameters as bytes embedded in the code block itself:

 

// Machine code of the demo routine. The first five bytes are a CALL whose
// 32-bit displacement is a placeholder; the setup code overwrites it with
// dataArray.Length and splices dataArray in right after it, so the CALL jumps
// over the data and pushes the data's address as its return address.
// The conditional bytes select 0x48 on 64-bit processes and 0x90 (NOP) on
// 32-bit ones, so the same array serves both modes.
// NOTE(review): the routine modifies e/rbx and never restores it — confirm
// this is acceptable for the calling convention in use.
byte[] codeArray = new byte[]

{

0xE8, // call next code after data

0x00, // displacement placeholder (4 bytes), patched at run time

0x00,

0x00,

0x00,

//

//data will go here

//


0x5B, // pop e/rbx - now e/rbx looks into data address

0xFF, // inc dword [e/rbx] - increments the Int32 parameter

0x03,

(byte)(IntPtr.Size > 4 ? 0x48 : 0x90), // 64-bit operand prefix, or NOP in 32-bit mode

0xFF, // inc e/rbx

0xC3,

(byte)(IntPtr.Size > 4 ? 0x48 : 0x90), // 64-bit operand prefix, or NOP in 32-bit mode

0xFF, // inc e/rbx

0xC3,

(byte)(IntPtr.Size > 4 ? 0x48 : 0x90), // 64-bit operand prefix, or NOP in 32-bit mode

0xFF, // inc e/rbx

0xC3,

(byte)(IntPtr.Size > 4 ? 0x48 : 0x90), // 64-bit operand prefix, or NOP in 32-bit mode

0xFF, // inc e/rbx - after four increments e/rbx points 4 bytes further, at the second parameter

0xC3,

(byte)(IntPtr.Size > 4 ? 0x48 : 0x90), // dec d/qword [rbx]

0xFF,

0x0B,


0xC3 // retn - return from your method

};

// Parameter block spliced into the code: a 4-byte value followed by an
// 8-byte value. It is read back from native memory after the call.
byte[] dataArray = new byte[]

{

0xFF, //parameter Int32 (bytes 0-3; initial value 255)

0x00,

0x00,

0x00,

0x00, //parameter Int64 (bytes 4-11; initial value 0)

0x00,

0x00,

0x00,

0x00,

0x00,

0x00,

0x00

};


Console.WriteLine("Ptr size: " + IntPtr.Size);

// Layout of the routine in executable memory:
//   [0]        codeArray[0]        the call opcode
//   [1..4]     dataArray.Length    call displacement, so the call skips the data
//   [5..]      dataArray           in/out parameters
//   [after]    codeArray[5..]      the rest of the code
int totalSize = codeArray.Length + dataArray.Length;

//allocate memory for our asm method
IntPtr pp = Native.VirtualAlloc((uint)totalSize);

// Calling through an invalid function pointer would take the process down,
// so stop here if the allocator reported failure (null on Windows, -1 from mmap).
if (pp == IntPtr.Zero || pp == new IntPtr(-1))
{
    throw new InvalidOperationException("Unable to allocate executable memory.");
}

// call opcode
Marshal.WriteByte(pp, codeArray[0]);

//write offset to the next code line
Marshal.WriteInt32(pp + 1, dataArray.Length);

//copy data
Marshal.Copy(dataArray, 0, pp + 5, dataArray.Length);

//copy rest of the code
Marshal.Copy(codeArray, 5, pp + 5 + dataArray.Length, codeArray.Length - 5);

Native.asmFunc nativeRoutine = (Native.asmFunc)Marshal.GetDelegateForFunctionPointer(pp, typeof(Native.asmFunc));

Console.WriteLine("in param int32 = " + BitConverter.ToInt32(dataArray, 0));

Console.WriteLine("in param int64 = " + BitConverter.ToInt64(dataArray, sizeof(Int32)));

Console.WriteLine("call asm method...");

nativeRoutine();

Console.WriteLine("exit asm method");

Because our methods are parameterless, no arguments are passed on the stack, so we do not have to care about stack layout. To get the address of our first parameter we use a well-known technique: a call instruction followed by pop reg. The call jumps over the data to the next instruction and pushes its return address, which is the address of the data; the pop then retrieves it. That address is the address of our first parameter.

To pass values back from our assembly method, we write them to the same address before returning.

// The parameter block starts right after the 5-byte call instruction;
// read it back so the values written by the native routine become visible.
IntPtr paramBlock = pp + 5;
Marshal.Copy(paramBlock, dataArray, 0, dataArray.Length);

int outInt32 = BitConverter.ToInt32(dataArray, 0);
Console.WriteLine("out param int32 = " + outInt32);

long outInt64 = BitConverter.ToInt64(dataArray, sizeof(Int32));
Console.WriteLine("out param int64 = " + outInt64);

// Release the native block, passing the same size that was requested for it.
uint allocatedSize = (uint)(codeArray.Length + dataArray.Length);
Native.VirtualFree(pp, allocatedSize);

GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);

Console.WriteLine("any key to exit");
Console.ReadLine();


Now run it and test it. Remember that running this code, or any hand-written machine code, under Visual Studio may cause errors and exceptions, so always test your code outside of any debugger.


I tested this code under Windows 10 64-bit, compiled in both 32-bit (x86) and 64-bit (x64) modes, and under Linux Ubuntu 14.04 64-bit.


Comments are welcome.


Thank you all,

See you later :)


1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y



Silverlight 3D Icons

Hello,


Today, I'm going to share my experience with Silverlight and 3D. Some time ago I saw a Microsoft presentation about a game platform or something similar, and it showed 3D buttons like the ones you see below. They attracted my attention so much that I decided to make something similar. After a few days of reviewing the video and thinking it over, I started to implement it myself.

The main thing here is adding image layers and rotate them at once, plus some additional visual effects. So, here is the mouse move handler:

// Pointer position relative to LayoutRoot.
p = e.GetPosition(LayoutRoot);

// Rotation amounts: half the horizontal / vertical distance between (mx, my)
// and the pointer. NOTE(review): (mx, my) is presumably the control's centre — confirm.
ry = (mx - p.X) / 2;

rx = -((my - p.Y) / 2);

// Angle of the vector from (mx, my) to the pointer, converted from radians to
// degrees, negated, then shifted by 180; used as the drop-shadow direction.
degree = Math.Atan2(p.Y - my, p.X - mx) * -180 / Math.PI;

degree = 180.0 + degree;

LayoutRoot.Effect.SetValue(DropShadowEffect.DirectionProperty, degree);

BorderControl.Projection.SetValue(PlaneProjection.RotationXProperty, rx);

BorderControl.Projection.SetValue(PlaneProjection.RotationYProperty, ry);

// Shift the background by the rotation amounts scaled down by k.
bgTranslate.X = offsetBackgroundX + ry / k;

bgTranslate.Y = offsetBackgroundY + (-rx / k);

// Every image layer gets the same rotation plus an extra amount derived from
// its ZIndex, and the same shadow direction.
foreach (IconImage ii in images)

{

ii.Image.Projection.SetValue(PlaneProjection.RotationXProperty, rx + (ii.ZIndex / globalZIndexStart));

ii.Image.Projection.SetValue(PlaneProjection.RotationYProperty, ry + (ii.ZIndex / globalZIndexStart));

ii.Image.Effect.SetValue(DropShadowEffect.DirectionProperty, degree);

}

So, you calculate the angle from two points and rotate each image by that angle.

The source code is below, thank you :)

Icon3D.zip (9.3KB)

1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y



C#.Net - Perspective Image Transform

Hello,


Today, I'm going to share one interesting thing that everyone may need - perspective image transformation.

It's a 2D transformation that makes an image look as if it were shown in a 3D perspective view, but without any actual 3D:


So, the main thing there is math. We take an image, specify four points and math will do the trick.

First step is to create and calculate matrices from the new four points:

//original points: the four corners of the width x height source rectangle

Point p1 = new Point(0, 0);

Point p2 = new Point(width, 0);

Point p3 = new Point(width, height);

Point p4 = new Point(0, height);

//create matrix from the original corners

A = Matrix3x3.Homogenous(p1, p2, p3, p4);

//new points: the four target corners, taken from 'points' in the same order

Point n1 = points[0];

Point n2 = points[1];

Point n3 = points[2];

Point n4 = points[3];

B = Matrix3x3.Homogenous(n1, n2, n3, n4);

// NOTE(review): the results of Inverse() are discarded, so it presumably
// inverts the matrix in place — confirm against Matrix3x3.
A.Inverse();

// C combines B with the inverted A and is then inverted itself; the pixel
// loop uses it to map a destination pixel to a source position.
C = B.MultMat(A);

C.Inverse();

Second step is to go through each pixel on your image and calculate new pixel:

// Colour used for every sample that falls outside the source image.
Color transparentWhite = Color.FromArgb(0, 255, 255, 255);

// Bounds-checked read of one source pixel. Every read goes through this,
// including the exact-integer case, which previously called GetPixel directly
// and threw for positions inside the -5 margin but left of / above the image.
Func<int, int, Color> sample = (sx, sy) =>
    (sx >= 0 && sx < width && sy >= 0 && sy < height)
        ? src.GetPixel(sx, sy)
        : transparentWhite;

Point ptDest = new Point(0, 0);
PointF ptOriginF = new Point(0, 0);
int iOrigX = 0;
Color pix = new Color();

for (int x = 0; x < width; ++x)
{
    for (int y = 0; y < height; ++y)
    {
        ptDest.X = x;
        ptDest.Y = y;

        // Map the destination pixel to a (fractional) position in the source.
        ptOriginF = C.Update(ptDest);

        // Accept positions up to 5 pixels left of / above the image; such
        // pixels blend towards transparent white. Everything else is left untouched.
        bool insideWithMargin =
            ptOriginF.X >= -5 && ptOriginF.X < width &&
            ptOriginF.Y >= -5 && ptOriginF.Y < height;
        if (!insideWithMargin)
        {
            continue;
        }

        // The casts truncate toward zero, so the fractions below are negative
        // for negative positions; idx / idy compensate for that.
        iOrigX = (int)ptOriginF.X;
        int iOrigY = (int)ptOriginF.Y;

        double dx = ptOriginF.X - iOrigX;
        double dy = ptOriginF.Y - iOrigY;

        if (dx != 0.0 || dy != 0.0)
        {
            // Correct square's direction
            int idx = (dx >= 0.0) ? 1 : -1;
            int idy = (dy >= 0.0) ? 1 : -1;

            dx = Math.Abs(dx);
            dy = Math.Abs(dy);

            // Get pixels of square
            Color pix1 = sample(iOrigX, iOrigY);
            Color pix2 = sample(iOrigX + idx, iOrigY);
            Color pix3 = sample(iOrigX, iOrigY + idy);
            Color pix4 = sample(iOrigX + idx, iOrigY + idy);

            // Use bilinear interpolation
            double r = pix1.R + (pix2.R - pix1.R) * dx + (pix3.R - pix1.R) * dy + (pix1.R - pix2.R - pix3.R + pix4.R) * dx * dy;
            double g = pix1.G + (pix2.G - pix1.G) * dx + (pix3.G - pix1.G) * dy + (pix1.G - pix2.G - pix3.G + pix4.G) * dx * dy;
            double b = pix1.B + (pix2.B - pix1.B) * dx + (pix3.B - pix1.B) * dy + (pix1.B - pix2.B - pix3.B + pix4.B) * dx * dy;
            double a = pix1.A + (pix2.A - pix1.A) * dx + (pix3.A - pix1.A) * dy + (pix1.A - pix2.A - pix3.A + pix4.A) * dx * dy;

            pix = Color.FromArgb((byte)a, (byte)r, (byte)g, (byte)b);
        }
        else
        {
            // Position lands exactly on a source pixel: no blending needed.
            pix = sample(iOrigX, iOrigY);
        }

        dst.SetPixel(ptDest.X, ptDest.Y, pix);
    }
}

So bilinear interpolation is the main technique used to transform your image.


Thank you.

PS

I will extend this and provide the source code on request.




1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y



Microsoft Expression Encoder - How to use and without installation

Hello friends,


Today I will share my experience with the Microsoft Expression Encoder 4 free version.

Download it and install from the link above, add references to the libraries:


- Microsoft.Expression.Encoder.dll

- Microsoft.Expression.Encoder.Api2.dll

- Microsoft.Expression.Encoder.Types.dll

- Microsoft.Expression.Encoder.Utilities.dll


So the minimum code you need to implement to be able to preview and capture video and audio and list devices is the following:

// Rebuild both device lists from scratch.
video.Clear();
audio.Clear();

// Each entry shows the device name and carries the device object as its value.
foreach (var dev in EncoderDevices.FindDevices(EncoderDeviceType.Video))
{
    // The terminating semicolon was missing on this statement.
    video.Add(new ComboboxItem() { Text = dev.Name, Value = dev });
}

foreach (var dev in EncoderDevices.FindDevices(EncoderDeviceType.Audio))
{
    audio.Add(new ComboboxItem() { Text = dev.Name, Value = dev });
}

// Error code checked below for the "device is busy" case.
const int DeviceInUseErrorCode = -2126905299;

try
{
    // Create and configure the live job only once; later calls reuse it.
    if (ljob == null)
    {
        ljob = new LiveJob();

        source = ljob.AddDeviceSource(
            (EncoderDevice)((ComboboxItem)cbVideoDevice.SelectedItem).Value,
            (EncoderDevice)((ComboboxItem)cbAudioDevice.SelectedItem).Value);

        source.PickBestVideoFormat(videoSize, 400000); //in 100 nanoseconds = 40 ms = 25 frames per seconds
        source.SetTransportMode(TransportMode.FastForward);

        ljob.OutputFormat.VideoProfile = new Microsoft.Expression.Encoder.Profiles.AdvancedVC1VideoProfile();
        ljob.OutputFormat.VideoProfile.Size = videoSize;
        ljob.OutputFormat.VideoProfile.FrameRate = 25;

        ljob.OutputFormat.AudioProfile = new Microsoft.Expression.Encoder.Profiles.WmaAudioProfile();
        ljob.OutputFormat.AudioProfile.Codec = Microsoft.Expression.Encoder.Profiles.AudioCodec.Wma;
    }

    // Render the preview into the capture form's panel.
    source.PreviewWindow = new PreviewWindow(
        new System.Runtime.InteropServices.HandleRef(capForm.Panel, capForm.Panel.Handle));

    ljob.ActivateSource(source);
}
catch (Microsoft.Expression.Encoder.SystemErrorException ex)
{
    if (ex.ErrorCode == DeviceInUseErrorCode)
    {
        MessageBox.Show("Device in use by another application.", "Test EE 4");
    }
    else
    {
        // Any other encoder error used to be swallowed silently; surface it.
        MessageBox.Show(ex.Message, "Test EE 4");
    }
}

and to start capture:

// Archive the encoded stream to a file, then start encoding.
string archivePath = System.IO.Path.GetFullPath(filePath + fileName);
FileArchivePublishFormat archive = new FileArchivePublishFormat(archivePath);
ljob.PublishFormats.Add(archive);

ljob.StartEncoding();


Another important thing is to avoid using the installation package of the Microsoft Expression Encoder but just use those four Dlls.

If you try to run your app on another PC, you will get an error saying that Expression Encoder has no license key, or something similar. To avoid this you have two options:


- bring the install package and install it on every PC where is your app

- make some changes into registry


I will show you what changes should we make to be able to use it on every PC without installation of Microsoft Expression Encoder, here is the code:

/// <summary>
/// Makes sure the two Expression Encoder registry keys exist under
/// HKEY_LOCAL_MACHINE. A key that is missing is created and handed to
/// <c>CheckInstallKey</c>; a key that already exists is left untouched.
/// Any exception is shown in a message box instead of being propagated.
/// </summary>
private void TellExpressionEncoderWhereItIs()
{
    string[] keyPaths =
    {
        "SOFTWARE\\Microsoft\\Expression\\Encoder\\4.0",
        "SOFTWARE\\Microsoft\\Expression\\Encoder\\eaa89a7c-d288-4a52-9b68-54930f18ffb7"
    };

    try
    {
        foreach (string keyPath in keyPaths)
        {
            using (RegistryKey existing = Registry.LocalMachine.OpenSubKey(keyPath))
            {
                if (existing != null)
                {
                    continue;
                }

                using (RegistryKey created = Registry.LocalMachine.CreateSubKey(keyPath))
                {
                    CheckInstallKey(created);
                }
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.ToString());
    }
}

/// <summary>
/// Writes the default install values into the given registry key, but only
/// for values that are currently missing, empty, or not strings.
/// </summary>
/// <param name="registryKey">Key to fill in; nothing happens when it is null.</param>
private void CheckInstallKey(RegistryKey registryKey)
{
    if (registryKey == null)
    {
        return;
    }

    // Value name -> default, applied in this order.
    string[][] defaults =
    {
        new[] { "InstallDir", "c:\\Program Files\\Microsoft Expression\\Encoder 4\\" },
        new[] { "Version", "4.0.4276.0" },
        new[] { "Encoder", "c:\\Program Files\\Microsoft Expression\\Encoder 4\\Encoder.exe" },
        new[] { "InstallPath", "c:\\Program Files\\Microsoft Expression\\Encoder 4\\Encoder.exe" }
    };

    foreach (string[] entry in defaults)
    {
        string valueName = entry[0];
        string current = registryKey.GetValue(valueName) as string;

        if (string.IsNullOrEmpty(current))
        {
            registryKey.SetValue(valueName, entry[1]);
        }
    }
}


Also, you will be able to select screen as a video device and capture your screen.


Thank you and let me know if you have questions or better idea :)


1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y



DataGridView and huge amount of data rows

Hello my friends,

Did you have a need to populate the DataGridView control with a lot of data? I'm sure you did have.

If you have a huge amount of rows, like 10 000 and more, you will see a huge problem in performance.

To avoid this performance hit, you need to set a proper value for the RowHeadersWidthSizeMode property.

So the best way is to disable auto resizing during data binding:

// Turn off automatic row-header width resizing before binding the data.
dataGridView1.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;

you actually can set EnableResizing but avoid to use the DataGridViewRowHeadersWidthSizeMode.AutoSizeToAllHeaders

The AutoSizeToAllHeaders is most time consumable parameter.

In addition would be better to set the RowHeadersVisible to false

// Hide the row headers altogether.
dataGridView1.RowHeadersVisible = false;

Now you can bind data source, and enable it all or set what you want it to be 


Thank you, see you next time.


1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y



C#.NET and unmanaged static C++ library

Hi friends,

Today I'm going to share my experience with c++ static libraries.


A static library differs from a dynamic library in that the parts of the static library's code that you use are included directly in the caller's binary.

For example, I have an exe and my exe code calls a function from a static library. It means that this static library function's code will be included into my exe and I will not need any library to be with the exe.


But with C# things are trickier: C#/.NET is managed code and a static library is unmanaged code, so it cannot be linked in and included.

So we need another way to do it. As you know we can use PInvoke to be able to access exported dll functions, and this is going to help us a lot.

First step is to create a Dll C++ project and link a static library to this Dll, our dll project is going to be something like a wrapper, and add some export method so we would be able to invoke them from C#.NET code.

Here is how to do it:

// Exported C-linkage wrapper around the static library's InitializeLib:
// forwards all three arguments unchanged and returns its result.
extern "C" __declspec(dllexport) int InitializeLib2(int type, const char *data, BOOL useFlag)

{

return ::InitializeLib(type, data, useFlag);

}

so we have declared our InitializeLib2 function for export and to be used via PInvoke from C# code, inside of this InitializeLib2 function we have a call to a static library InitializeLib function  and just pass parameters from our method.

Now we build it and get a Dll file that can be pinvoked from our C# code.

Here is how to do it:

/// <summary>
/// P/Invoke binding for the <c>InitializeLib2</c> export of MyWrapper.dll (cdecl).
/// </summary>
/// <param name="zero">Integer argument passed through to the native function.</param>
/// <param name="str">
/// Reference to the first byte of the string data; the call site in this
/// article passes a zero-terminated ASCII byte array.
/// </param>
/// <param name="b">Boolean argument passed through to the native function.</param>
/// <returns>The integer returned by the native function.</returns>
[DllImport("MyWrapper.dll", CallingConvention = CallingConvention.Cdecl)]

public static extern int InitializeLib2(int zero, ref byte str, bool b);

and here how to call this method:

// Encode the text as ASCII with an explicit terminating zero byte, then pass
// a reference to the first byte to the native function.
string text = "my string data" + '\0';
byte[] str = ASCIIEncoding.ASCII.GetBytes(text);

int a = InitializeLib2(0, ref str[0], false);

so we pass an integer value, I like to pass strings as byte array and last parameter is boolean value.


Thats all. At the end of this you will have MyWrapper.dll and managed exe file.

Thank you, and good luck :)


1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y



C#.NET - Fast Memory Copy method with x86 Assembler

Introduction

I'm Oleksandr Karpov and this is my first article here, thanks for reading it.

Here, I'm going to show and explain how to copy data really fast and how to use assembly under C# and .NET. In my case, I use it in a video creating application from images, video and sound.
Also, if you have an assembly method or function that you need to use under C#, it will show you how to do it in a quick and simple way.

Background

To understand it all, it would be great for you to know assembly language, memory alignment and some C#, Windows and .NET advanced techniques.
To be able to copy data really fast, the memory addresses need to be 16-byte aligned; otherwise the speed will be almost the same as the standard copy (in my case, about 1.02 times faster).

The code uses SSE instructions that are supported by processors from Pentium III+ (KNI/MMX2), AMD Athlon (AMD EMMX).

I have tested it on my Pentium Dual-Core E5800 3.2GHz with 4GB RAM in dual mode.
For me, the fast copy method is 1.5 times faster than the standard with 16 byte memory aligned and
almost the same (1.02 times faster) with non-aligned memory addresses.

To be able to allocate 16 byte aligned memory in C# under Windows, we have three ways to do it:

a) At this time it seems that the Bitmap object (actually Windows itself, internally) allocates memory at a 16-byte-aligned address, so we can use a Bitmap for easy and quick aligned memory allocation;

b) As managed array by adding 8 bytes more (as windows heap is 8 byte aligned) and calculating 16 byte aligned memory point within allocated memory:

int dataLength = 4096;

// Over-allocate by 15 bytes: whatever alignment the array data starts with,
// a 16-byte-aligned address is at most 15 bytes further on, and dataLength
// bytes still fit after it.
byte[] buffer = new byte[dataLength + 15];

// Pin the array first. Its address is only stable while it is pinned, and
// Marshal.UnsafeAddrOfPinnedArrayElement requires a pinned array.
// Call bufferHandle.Free() once the buffer is no longer needed.
GCHandle bufferHandle = GCHandle.Alloc(buffer, GCHandleType.Pinned);

IntPtr addr = Marshal.UnsafeAddrOfPinnedArrayElement(buffer, 0);

// Round the address up to the next multiple of 16 and take the distance from
// the array start. Both operands are converted to long, because an IntPtr
// cannot be subtracted from a long directly.
long start = (long)addr;
int bufferAlignedOffset = (int)(((start + 15) / 16 * 16) - start);

c) By allocating memory with VirtualAlloc API:

// Keep the address returned by VirtualAlloc: it is the one that has to be
// passed to VirtualFree later, so it must not be overwritten.
// UIntPtr has no constructor taking an int, hence the explicit cast.
IntPtr baseAddr = VirtualAlloc(IntPtr.Zero,
    new UIntPtr((uint)(dataLength + 8)),
    AllocationTypes.Commit | AllocationTypes.Reserve,
    MemoryProtections.ExecuteReadWrite);

// Round up to the next multiple of 16; an address that is already aligned
// is left unchanged.
IntPtr addr = new IntPtr(((long)baseAddr + 15) / 16 * 16);

Using the Code

This is a complete performance test that will show you performance measurements and how to use it all.

The FastMemCopy class contains all things for fast memory copy logic.

First thing you need is to create a default Windows Forms application project and put two buttons on the form and the PictureBox control as we will test it on images.

Let's declare some fields:

// Path of the image chosen in the open-file dialog.
string bitmapPath;

// bmp: the image loaded from bitmapPath; bmp2: same-sized bitmap backed by 'buffer'.
Bitmap bmp, bmp2;

// Lock data returned by LockBits for bmp and bmp2 respectively.
BitmapData bmpd, bmpd2;

// Pixel storage for bmp2 (width * 4 * height bytes).
byte[] buffer = null;

Now, we will create two methods to handle click events for our buttons.

For standard method:

/// <summary>
/// Lets the user pick an image, copies its pixels into the second bitmap with
/// the standard copy method and shows the result in the picture box.
/// </summary>
private void btnStandard_Click(object sender, EventArgs e)
{
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        if (ofd.ShowDialog() != System.Windows.Forms.DialogResult.OK)
        {
            return;
        }

        bitmapPath = ofd.FileName;
    }

    //open a selected image and create an empty image with the same size
    OpenImage();

    //give read and write access to the image bits
    UnlockBitmap();

    try
    {
        //copy data from one image to another by standard method
        CopyImage();
    }
    finally
    {
        //always release the bits again, even when the copy throws; otherwise
        //both bitmaps would stay locked and could not be displayed or reused
        LockBitmap();
    }

    //lets see what we have
    pictureBox1.Image = bmp2;
}

and for fast method:

/// <summary>
/// Lets the user pick an image, copies its pixels into the second bitmap with
/// the fast copy method and shows the result in the picture box.
/// </summary>
private void btnFast_Click(object sender, EventArgs e)
{
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        if (ofd.ShowDialog() != System.Windows.Forms.DialogResult.OK)
        {
            return;
        }

        bitmapPath = ofd.FileName;
    }

    //open a selected image and create an empty image with the same size
    OpenImage();

    //give read and write access to the image bits
    UnlockBitmap();

    try
    {
        //copy data from one image to another with our fast method
        FastCopyImage();
    }
    finally
    {
        //always release the bits again, even when the copy throws; otherwise
        //both bitmaps would stay locked and could not be displayed or reused
        LockBitmap();
    }

    //lets see what we have
    pictureBox1.Image = bmp2;
}

Ok, now we have buttons and event handlers so let's implement methods that will open images, lock, unlock them and standard copy method:

Open an image:

// Keeps 'buffer' pinned for as long as bmp2 uses its memory as pixel storage.
private GCHandle bufferHandle;

/// <summary>
/// Disposes any previously opened bitmaps, loads the image at
/// <c>bitmapPath</c> into <c>bmp</c> and creates <c>bmp2</c>, a 32bpp ARGB
/// bitmap of the same size whose pixels live in <c>buffer</c>.
/// </summary>
void OpenImage()
{
    pictureBox1.Image = null;

    if (bmp != null)
    {
        bmp.Dispose();
        bmp = null;
    }

    if (bmp2 != null)
    {
        bmp2.Dispose();
        bmp2 = null;
    }

    // Unpin the old buffer only after bmp2, which points into it, is gone.
    if (bufferHandle.IsAllocated)
    {
        bufferHandle.Free();
    }

    buffer = null;

    GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);

    bmp = (Bitmap)Bitmap.FromFile(bitmapPath);

    // 4 bytes per pixel, rows stored back to back (stride = width * 4).
    buffer = new byte[bmp.Width * 4 * bmp.Height];

    // The bitmap keeps using the raw address, so the array has to be pinned;
    // an unpinned array can be moved by the garbage collector, leaving the
    // bitmap pointing at stale memory.
    bufferHandle = GCHandle.Alloc(buffer, GCHandleType.Pinned);

    bmp2 = new Bitmap(bmp.Width, bmp.Height, bmp.Width * 4, PixelFormat.Format32bppArgb,
        bufferHandle.AddrOfPinnedObject());
}

Lock and unlock bitmaps:

/// <summary>
/// Locks the full area of both bitmaps for read/write access as 32bpp ARGB and
/// stores the resulting lock data in <c>bmpd</c> and <c>bmpd2</c>.
/// The same rectangle, sized from <c>bmp</c>, is used for both.
/// </summary>
void UnlockBitmap()
{
    Rectangle wholeImage = new Rectangle(0, 0, bmp.Width, bmp.Height);

    bmpd = bmp.LockBits(wholeImage, ImageLockMode.ReadWrite, PixelFormat.Format32bppArgb);
    bmpd2 = bmp2.LockBits(wholeImage, ImageLockMode.ReadWrite, PixelFormat.Format32bppArgb);
}

/// <summary>
/// Releases the bit locks taken by <c>UnlockBitmap</c>, first on <c>bmp</c>
/// and then on <c>bmp2</c>.
/// </summary>
void LockBitmap()
{
    bmp.UnlockBits(bmpd);
    bmp2.UnlockBits(bmpd2);
}

and copy data from one image to another and show measured time:

/// <summary>
/// Copies the locked source pixels into <c>buffer</c> ten times with
/// <c>Marshal.Copy</c> and shows the elapsed stopwatch ticks in a message box.
/// </summary>
void CopyImage()
{
    const int repetitions = 10;

    //measure only the copy loop
    Stopwatch timer = Stopwatch.StartNew();

    for (int pass = 0; pass < repetitions; pass++)
    {
        System.Runtime.InteropServices.Marshal.Copy(bmpd.Scan0, buffer, 0, buffer.Length);
    }

    timer.Stop();

    //show measured time
    MessageBox.Show(timer.ElapsedTicks.ToString());
}

That's it for the standard copy-paste method. Actually, there is nothing too complex, we use well-known System.Runtime.InteropServices.Marshal.Copy method.

And one more "middle-method" for the fast copy logic:

// Copies buffer.Length bytes from the locked source bitmap (bmpd.Scan0) to the
// locked destination bitmap (bmpd2.Scan0) using FastMemCopy.
void FastCopyImage()

{

FastMemCopy.FastMemoryCopy(bmpd.Scan0, bmpd2.Scan0, buffer.Length);

}

Now, let's implement the FastMemCopy class. Here is the declaration of the class and some types we will use inside of it:

internal static class FastMemCopy

{

/// <summary>Flag values for the <c>flAllocationType</c> argument of <c>NativeMethods.VirtualAlloc</c>.</summary>
[Flags]
private enum AllocationTypes : uint
{
    Commit = 0x1000,
    Reserve = 0x2000,
    Reset = 0x80000,
    LargePages = 0x20000000,
    Physical = 0x400000,
    TopDown = 0x100000,
    WriteWatch = 0x200000
}

/// <summary>Flag values for the <c>flProtect</c> argument of <c>NativeMethods.VirtualAlloc</c>.</summary>
[Flags]
private enum MemoryProtections : uint
{
    Execute = 0x10,
    ExecuteRead = 0x20,
    ExecuteReadWrite = 0x40,
    ExecuteWriteCopy = 0x80,
    NoAccess = 0x01,
    ReadOnly = 0x02,
    ReadWrite = 0x04,
    WriteCopy = 0x08,
    GuartModifierflag = 0x100,
    NoCacheModifierflag = 0x200,
    WriteCombineModifierflag = 0x400
}

/// <summary>Flag values for the <c>flFreeType</c> argument of <c>NativeMethods.VirtualFree</c>.</summary>
[Flags]
private enum FreeTypes : uint
{
    Decommit = 0x4000,
    Release = 0x8000
}

/// <summary>Signature of the native copy routine: no arguments, no return value, cdecl.</summary>
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
private unsafe delegate void FastMemCopyDelegate();

/// <summary>kernel32 imports used to allocate and release memory for the native routine.</summary>
private static class NativeMethods
{
    [DllImport("kernel32.dll", SetLastError = true)]
    internal static extern IntPtr VirtualAlloc(
        IntPtr lpAddress,
        UIntPtr dwSize,
        AllocationTypes flAllocationType,
        MemoryProtections flProtect);

    [DllImport("kernel32")]
    [return: MarshalAs(UnmanagedType.Bool)]
    internal static extern bool VirtualFree(
        IntPtr lpAddress,
        uint dwSize,
        FreeTypes flFreeType);
}

Now let's declare the method itself:

/// <summary>
/// Copies <paramref name="nBytes"/> bytes from <paramref name="src"/> to
/// <paramref name="dst"/> ten times using the x86 routine stored in
/// <c>x86_FastMemCopy_New</c>, then shows the elapsed stopwatch ticks in a
/// message box. Only 32-bit processes are supported.
/// </summary>
/// <param name="src">Address of the source data.</param>
/// <param name="dst">Address of the destination buffer.</param>
/// <param name="nBytes">Number of bytes to copy.</param>
/// <exception cref="ApplicationException">The process is 64-bit.</exception>
/// <remarks>
/// Any other exception raised while preparing or running the routine is shown
/// in a message box and not propagated.
/// </remarks>
public static unsafe void FastMemoryCopy(IntPtr src, IntPtr dst, int nBytes)
{
    if (IntPtr.Size == 8)
    {
        throw new ApplicationException("x64 is not supported yet!");
    }

    if (IntPtr.Size != 4)
    {
        return;
    }

    //we are in 32 bit mode

    int codeLength = x86_FastMemCopy_New.Length;

    //allocate memory for our asm method
    IntPtr codeBase = NativeMethods.VirtualAlloc(
        IntPtr.Zero,
        new UIntPtr((uint)codeLength),
        AllocationTypes.Commit | AllocationTypes.Reserve,
        MemoryProtections.ExecuteReadWrite);

    try
    {
        if (codeBase == IntPtr.Zero)
        {
            throw new InvalidOperationException(
                "VirtualAlloc failed with Win32 error " + Marshal.GetLastWin32Error() + ".");
        }

        //copy our method bytes to allocated memory
        Marshal.Copy(x86_FastMemCopy_New, 0, codeBase, codeLength);

        //make a delegate to our method
        FastMemCopyDelegate _fastmemcopy =
            (FastMemCopyDelegate)Marshal.GetDelegateForFunctionPointer(codeBase,
                typeof(FastMemCopyDelegate));

        // The routine reads its parameters from three 8-byte slots at the very
        // end of the block. Only the low 4 bytes of each slot are written
        // (a pointer is 4 bytes wide here). codeBase itself is never modified,
        // so it can be freed below.

        //store length param
        Marshal.WriteInt32(IntPtr.Add(codeBase, codeLength - 8), nBytes);

        //store destination address param
        Marshal.WriteIntPtr(IntPtr.Add(codeBase, codeLength - 16), dst);

        //store source address param
        Marshal.WriteIntPtr(IntPtr.Add(codeBase, codeLength - 24), src);

        //Start stopwatch
        Stopwatch sw = Stopwatch.StartNew();

        //copy-paste all data 10 times
        for (int i = 0; i < 10; i++)
        {
            _fastmemcopy();
        }

        //stop stopwatch
        sw.Stop();

        //get message with measured time
        System.Windows.Forms.MessageBox.Show(sw.ElapsedTicks.ToString());
    }
    catch (Exception ex)
    {
        //if any exception
        System.Windows.Forms.MessageBox.Show(ex.Message);
    }
    finally
    {
        //free allocated memory: FreeTypes.Release needs the base address
        //returned by VirtualAlloc and a size of 0; the previous call passed an
        //offset pointer and a non-zero size, so the block was never released.
        //The forced GC.Collect that used to follow was dropped: nothing here
        //depends on a collection, and it made every call pay for a full GC.
        if (codeBase != IntPtr.Zero)
        {
            NativeMethods.VirtualFree(codeBase, 0, FreeTypes.Release);
        }
    }
}

and assembly code that is represented as an array of bytes with explanation:

// Machine code of the 32-bit x86 copy routine, followed by its parameter slots.
// Layout (offsets from the start of the array, total length 0x172 bytes):
//   0x000-0x159  code: copies 128-byte chunks through xmm0-xmm7, then the remaining
//                (length & 0x7F) bytes with rep movsb
//   0x15A        source buffer address       (8-byte slot, the code reads the low 4 bytes)
//   0x162        destination buffer address  (8-byte slot, the code reads the low 4 bytes)
//   0x16A        number of bytes to copy     (8-byte slot, the code reads the low 4 bytes)
// The slots are addressed relative to ebp, which the routine loads from eax on entry.
// NOTE(review): this relies on eax holding the routine's own base address when it is
// called - confirm that the caller guarantees this.
// All jump displacements are relative, so instructions must not be inserted or removed
// without recomputing the displacements and the slot offsets.
private static byte[] x86_FastMemCopy_New = new byte[]

{

0x90, //nop: do nothing

0x60, //pushad: save all general-purpose registers on the stack

0x95, //xchg ebp, eax: eax is expected to contain the memory address of our method

0x8B, 0xB5, 0x5A, 0x01, 0x00, 0x00, //mov esi,[ebp][00000015A]: get source buffer address

0x89, 0xF0, //mov eax,esi

0x83, 0xE0, 0x0F, //and eax,00F: keep the low 4 bits to check if the source is 16 byte aligned

0x8B, 0xBD, 0x62, 0x01, 0x00, 0x00, //mov edi,[ebp][000000162]: get destination address

0x89, 0xFB, //mov ebx,edi

0x83, 0xE3, 0x0F, //and ebx,00F: keep the low 4 bits to check if the destination is 16 byte aligned

0x8B, 0x8D, 0x6A, 0x01, 0x00, 0x00, //mov ecx,[ebp][00000016A]: get number of bytes to copy

0xC1, 0xE9, 0x07, //shr ecx,7: divide length by 128

0x85, 0xC9, //test ecx,ecx: check if zero

0x0F, 0x84, 0x1C, 0x01, 0x00, 0x00, //jz 000000146 -> copy the rest

0x0F, 0x18, 0x06, //prefetchnta [esi]: pre-fetch non-temporal source data for reading

0x85, 0xC0, //test eax,eax: check if source address is 16 byte aligned

0x0F, 0x84, 0x8B, 0x00, 0x00, 0x00, //jz 0000000C0 -> go to the aligned-source copy loop

0x0F, 0x18, 0x86, 0x80, 0x02, 0x00, 0x00, //prefetchnta [esi][000000280]: pre-fetch more source data (unaligned-source loop starts here, offset 0x35)

0x0F, 0x10, 0x06, //movups xmm0,[esi]: copy 16 bytes of source data

0x0F, 0x10, 0x4E, 0x10, //movups xmm1,[esi][010]: copy 16 more bytes

0x0F, 0x10, 0x56, 0x20, //movups xmm2,[esi][020]: copy more

0x0F, 0x18, 0x86, 0xC0, 0x02, 0x00, 0x00, //prefetchnta [esi][0000002C0]: pre-fetch more

0x0F, 0x10, 0x5E, 0x30, //movups xmm3,[esi][030]

0x0F, 0x10, 0x66, 0x40, //movups xmm4,[esi][040]

0x0F, 0x10, 0x6E, 0x50, //movups xmm5,[esi][050]

0x0F, 0x10, 0x76, 0x60, //movups xmm6,[esi][060]

0x0F, 0x10, 0x7E, 0x70, //movups xmm7,[esi][070]: we've copied 128 bytes of source data

0x85, 0xDB, //test ebx,ebx: check if destination address is 16 byte aligned

0x74, 0x21, //jz 000000087 -> go to the aligned paste if aligned

0x0F, 0x11, 0x07, //movups [edi],xmm0: paste first 16 bytes to non-aligned destination address

0x0F, 0x11, 0x4F, 0x10, //movups [edi][010],xmm1: paste more

0x0F, 0x11, 0x57, 0x20, //movups [edi][020],xmm2

0x0F, 0x11, 0x5F, 0x30, //movups [edi][030],xmm3

0x0F, 0x11, 0x67, 0x40, //movups [edi][040],xmm4

0x0F, 0x11, 0x6F, 0x50, //movups [edi][050],xmm5

0x0F, 0x11, 0x77, 0x60, //movups [edi][060],xmm6

0x0F, 0x11, 0x7F, 0x70, //movups [edi][070],xmm7: we've pasted 128 bytes of source data

0xEB, 0x1F, //jmps 0000000A6 -> continue

0x0F, 0x2B, 0x07, //movntps [edi],xmm0: paste first 16 bytes to aligned destination address

0x0F, 0x2B, 0x4F, 0x10, //movntps [edi][010],xmm1: paste more

0x0F, 0x2B, 0x57, 0x20, //movntps [edi][020],xmm2

0x0F, 0x2B, 0x5F, 0x30, //movntps [edi][030],xmm3

0x0F, 0x2B, 0x67, 0x40, //movntps [edi][040],xmm4

0x0F, 0x2B, 0x6F, 0x50, //movntps [edi][050],xmm5

0x0F, 0x2B, 0x77, 0x60, //movntps [edi][060],xmm6

0x0F, 0x2B, 0x7F, 0x70, //movntps [edi][070],xmm7: we've pasted 128 bytes of source data

0x81, 0xC6, 0x80, 0x00, 0x00, 0x00, //add esi,000000080: increment source address by 128

0x81, 0xC7, 0x80, 0x00, 0x00, 0x00, //add edi,000000080: increment destination address by 128

0x83, 0xE9, 0x01, //sub ecx,1: decrement counter

0x0F, 0x85, 0x7A, 0xFF, 0xFF, 0xFF, //jnz 000000035 -> continue if not zero

0xE9, 0x86, 0x00, 0x00, 0x00, //jmp 000000146 -> go to copy the rest of data

0x0F, 0x18, 0x86, 0x80, 0x02, 0x00, 0x00, //prefetchnta [esi][000000280]: pre-fetch source data (aligned-source loop starts here, offset 0xC0)

0x0F, 0x28, 0x06, //movaps xmm0,[esi]: copy 128 bytes from aligned source address

0x0F, 0x28, 0x4E, 0x10, //movaps xmm1,[esi][010]: copy more

0x0F, 0x28, 0x56, 0x20, //movaps xmm2,[esi][020]

0x0F, 0x18, 0x86, 0xC0, 0x02, 0x00, 0x00, //prefetchnta [esi][0000002C0]: pre-fetch more data

0x0F, 0x28, 0x5E, 0x30, //movaps xmm3,[esi][030]

0x0F, 0x28, 0x66, 0x40, //movaps xmm4,[esi][040]

0x0F, 0x28, 0x6E, 0x50, //movaps xmm5,[esi][050]

0x0F, 0x28, 0x76, 0x60, //movaps xmm6,[esi][060]

0x0F, 0x28, 0x7E, 0x70, //movaps xmm7,[esi][070]: we've copied 128 bytes of source data

0x85, 0xDB, //test ebx,ebx: check if destination address is 16 byte aligned

0x74, 0x21, //jz 000000112 -> go to the aligned paste if aligned

0x0F, 0x11, 0x07, //movups [edi],xmm0: paste 16 bytes to non-aligned destination address

0x0F, 0x11, 0x4F, 0x10, //movups [edi][010],xmm1: paste more

0x0F, 0x11, 0x57, 0x20, //movups [edi][020],xmm2

0x0F, 0x11, 0x5F, 0x30, //movups [edi][030],xmm3

0x0F, 0x11, 0x67, 0x40, //movups [edi][040],xmm4

0x0F, 0x11, 0x6F, 0x50, //movups [edi][050],xmm5

0x0F, 0x11, 0x77, 0x60, //movups [edi][060],xmm6

0x0F, 0x11, 0x7F, 0x70, //movups [edi][070],xmm7: we've pasted 128 bytes of data

0xEB, 0x1F, //jmps 000000131 -> continue copy-paste

0x0F, 0x2B, 0x07, //movntps [edi],xmm0: paste 16 bytes to aligned destination address

0x0F, 0x2B, 0x4F, 0x10, //movntps [edi][010],xmm1: paste more

0x0F, 0x2B, 0x57, 0x20, //movntps [edi][020],xmm2

0x0F, 0x2B, 0x5F, 0x30, //movntps [edi][030],xmm3

0x0F, 0x2B, 0x67, 0x40, //movntps [edi][040],xmm4

0x0F, 0x2B, 0x6F, 0x50, //movntps [edi][050],xmm5

0x0F, 0x2B, 0x77, 0x60, //movntps [edi][060],xmm6

0x0F, 0x2B, 0x7F, 0x70, //movntps [edi][070],xmm7: we've pasted 128 bytes of data

0x81, 0xC6, 0x80, 0x00, 0x00, 0x00, //add esi,000000080: increment source address by 128

0x81, 0xC7, 0x80, 0x00, 0x00, 0x00, //add edi,000000080: increment destination address by 128

0x83, 0xE9, 0x01, //sub ecx,1: decrement counter

0x0F, 0x85, 0x7A, 0xFF, 0xFF, 0xFF, //jnz 0000000C0 -> continue copy-paste if non-zero

0x8B, 0x8D, 0x6A, 0x01, 0x00, 0x00, //mov ecx,[ebp][00000016A]: get number of bytes to copy (offset 0x146)

0x83, 0xE1, 0x7F, //and ecx,07F: get the remaining number of bytes (length mod 128)

0x85, 0xC9, //test ecx,ecx: check if there are bytes left

0x74, 0x02, //jz 000000155 -> exit if there are no more bytes

0xF3, 0xA4, //rep movsb: copy the rest of the bytes

0x0F, 0xAE, 0xF8, //sfence: performs a serializing operation on all store-to-memory instructions

0x61, //popad: restore the general-purpose registers saved by pushad

0xC3, //retn: return from our method to C#

0x00, 0x00, 0x00, 0x00, //source buffer address (offset 0x15A, patched in before the call)

0x00, 0x00, 0x00, 0x00,

0x00, 0x00, 0x00, 0x00, //destination buffer address (offset 0x162, patched in before the call)

0x00, 0x00, 0x00, 0x00,

0x00, 0x00, 0x00, 0x00, //number of bytes to copy-paste (offset 0x16A, patched in before the call)

0x00, 0x00, 0x00, 0x00

};

We will call this assembly method via delegate we have created earlier.

This method works in 32 bit mode for now and I will implement the 64 bit mode later.
I will add source code if anyone is interested in it (almost all code is there in the article).

Note that the assembly code throws an exception when it is run under Visual Studio, and I still don't understand why.

Points of Interest

While implementing and testing this method, I found that the prefetchnta instruction is not described very clearly even in the Intel specification, so I tried to figure it out myself and with the help of Google.
Also, pay attention to the movntps and movaps instructions, as they work only with 16-byte-aligned memory addresses.

History

  • Bitmap and 16 byte memory alignment
  • Source code and memory alignment samples were added
  • First version - 06/23/2015
FastMemoryCopy_src.zip (14.4KB)

1vqHSTrq1GEoEF7QsL8dhmJfRMDVxhv2y