I have a torchvision MobileNet model with the classification head removed, which I use for similarity search. I saved it as a TorchScript module. When I run the similarity search in Python it returns the right results, but in Kotlin it does not: I have checked with the same images, and the two produce different outputs. My guess is that the problem is in my preprocessing, but I have tried everything I can find and nothing changes the result. Here is my Python code:
import torch
import numpy as np
from io import BytesIO
from PIL import Image
from fastapi import FastAPI, File, HTTPException, UploadFile
from torchvision import transforms

# Model and transform setup (MODEL_FILE, app, and model are defined elsewhere)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

image_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

def load_trained_model():
    model = torch.jit.load(MODEL_FILE, map_location=device)
    model.eval()
    return model

def extract_features(pil_img, model):
    with torch.no_grad():
        tensor = image_transform(pil_img).unsqueeze(0).to(device)
        features = model(tensor)
        if len(features.shape) > 2:
            features = features.view(features.size(0), -1)
        return features.cpu().numpy().astype(np.float32)
@app.post("/extract_features")
async def extract_image_features(image: UploadFile = File(...)):
    try:
        image_bytes = await image.read()
        with Image.open(BytesIO(image_bytes)) as img:
            processed_img = img.convert("RGB")
            raw_features = extract_features(processed_img, model)
        # Minimal response and error handling to complete the snippet
        # (assumed shape; the handler body was cut off here)
        return {"features": raw_features.tolist()}
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
And here is my Kotlin (PyTorch Android) code:
import android.graphics.Bitmap
import org.pytorch.IValue
import org.pytorch.Tensor
import org.pytorch.torchvision.TensorImageUtils
import kotlin.math.roundToInt

fun preprocessImage(bitmap: Bitmap): Tensor {
    val rgbBitmap = if (bitmap.config != Bitmap.Config.ARGB_8888) {
        bitmap.copy(Bitmap.Config.ARGB_8888, true)
    } else {
        bitmap
    }
    val resizedBitmap = resizeWithAspectRatio(rgbBitmap, 256)
    val croppedBitmap = centerCrop(resizedBitmap, 224, 224)
    val mean = floatArrayOf(0.485f, 0.456f, 0.406f)
    val std = floatArrayOf(0.229f, 0.224f, 0.225f)
    return TensorImageUtils.bitmapToFloat32Tensor(croppedBitmap, mean, std)
}
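To rule out TensorImageUtils itself, here is a minimal hand-rolled equivalent of bitmapToFloat32Tensor (a sketch, assuming the standard 1x3xHxW, RGB-ordered, (x/255 - mean)/std layout; bitmapToNormalizedTensor is my own name). If this and the library call disagree for the same bitmap, the channel order or normalization is off:

// Hand-rolled CHW/RGB tensor conversion with per-channel normalization.
fun bitmapToNormalizedTensor(bitmap: Bitmap, mean: FloatArray, std: FloatArray): Tensor {
    val w = bitmap.width
    val h = bitmap.height
    val pixels = IntArray(w * h)
    bitmap.getPixels(pixels, 0, w, 0, 0, w, h)
    val data = FloatArray(3 * w * h)
    for (i in pixels.indices) {
        val p = pixels[i]
        val r = ((p shr 16) and 0xFF) / 255f
        val g = ((p shr 8) and 0xFF) / 255f
        val b = (p and 0xFF) / 255f
        data[i] = (r - mean[0]) / std[0]             // R plane
        data[w * h + i] = (g - mean[1]) / std[1]     // G plane
        data[2 * w * h + i] = (b - mean[2]) / std[2] // B plane
    }
    return Tensor.fromBlob(data, longArrayOf(1, 3, h.toLong(), w.toLong()))
}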
private fun resizeWithAspectRatio(bitmap: Bitmap, targetSize: Int): Bitmap {
    val width = bitmap.width
    val height = bitmap.height
    val scale = if (width < height) {
        targetSize.toFloat() / width.toFloat()
    } else {
        targetSize.toFloat() / height.toFloat()
    }
    val newWidth = (width * scale).roundToInt()
    val newHeight = (height * scale).roundToInt()
    // Bilinear scaling to the aspect-preserving size, matching torchvision's
    // Resize(256) on the shorter side; scaling to a fixed 256x256 square here
    // would squash the image and change every downstream pixel.
    return Bitmap.createScaledBitmap(bitmap, newWidth, newHeight, true)
}
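Even with the sizes correct, a single bilinear pass on Android is not pixel-identical to PIL: PIL's bilinear downscale is antialiased (the filter widens with the shrink factor), while Bitmap scaling always samples a fixed 2x2 neighbourhood, so large downscales diverge noticeably. Progressive halving before the final resize is a common approximation (a sketch, not PIL-exact; downscaleSmooth is my own helper):

// Halve repeatedly until one more halving would undershoot the target, then
// do a final bilinear pass; this approximates an antialiased downscale.
private fun downscaleSmooth(src: Bitmap, targetWidth: Int, targetHeight: Int): Bitmap {
    var current = src
    while (current.width / 2 >= targetWidth && current.height / 2 >= targetHeight) {
        current = Bitmap.createScaledBitmap(current, current.width / 2, current.height / 2, true)
    }
    return Bitmap.createScaledBitmap(current, targetWidth, targetHeight, true)
}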
private fun centerCrop(bitmap: Bitmap, targetWidth: Int, targetHeight: Int): Bitmap {
    val width = bitmap.width
    val height = bitmap.height
    val validStartX = maxOf(0, (width - targetWidth) / 2)
    val validStartY = maxOf(0, (height - targetHeight) / 2)
    val validTargetWidth = minOf(targetWidth, width - validStartX)
    val validTargetHeight = minOf(targetHeight, height - validStartY)
    return Bitmap.createBitmap(bitmap, validStartX, validStartY, validTargetWidth, validTargetHeight)
}
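One caveat with the clamping above: if the source bitmap is smaller than the crop window, it silently returns a crop smaller than 224x224 and the model sees a different input shape. A stricter variant (centerCropStrict is a hypothetical helper) fails fast instead:

// Fails fast instead of clamping, so shape mismatches surface immediately.
private fun centerCropStrict(bitmap: Bitmap, targetWidth: Int, targetHeight: Int): Bitmap {
    require(bitmap.width >= targetWidth && bitmap.height >= targetHeight) {
        "Bitmap ${bitmap.width}x${bitmap.height} is smaller than crop ${targetWidth}x${targetHeight}"
    }
    val x = (bitmap.width - targetWidth) / 2
    val y = (bitmap.height - targetHeight) / 2
    return Bitmap.createBitmap(bitmap, x, y, targetWidth, targetHeight)
}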
fun extractFeatures(bitmap: Bitmap): FloatArray {
    if (model == null) {
        throw IllegalStateException("Model not loaded. Call loadModel() first.")
    }
    val inputTensor = preprocessImage(bitmap)
    val output = model!!.forward(IValue.from(inputTensor))
    return output.toTensor().dataAsFloatArray
}
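To pin down whether the mismatch comes from preprocessing or from the model, compare the input tensors rather than the outputs: log the first values of the Kotlin input tensor and print image_transform(processed_img).flatten()[:5] for the same image on the Python side. A small sketch (debugInputTensor is my own helper):

import android.util.Log

// Logs the shape and first five values of the preprocessed input tensor. If
// these already disagree with the Python transform's output for the same
// image, the model is innocent and the preprocessing is the problem.
fun debugInputTensor(bitmap: Bitmap) {
    val tensor = preprocessImage(bitmap)
    val values = tensor.dataAsFloatArray
    Log.d("Preproc", "shape=${tensor.shape().joinToString()} first5=${values.take(5).joinToString()}")
}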
First five outputs from Kotlin:
[0.7993497, 0.30109355, 0.32214138, 0.47712356, 0.5185487]
Python:
[ 1.2595854 -0.07939269 -0.3717999 0.22528967 0.12919804]
Why are the outputs so different?