diff --git a/README.md b/README.md
index f303afd..e0cf042 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,18 @@
+2018.5.15
+1. 使用 ImageReader 的 Surface 作为录屏输入,然后将输入数据通过 FFmpegRTMPRecorder 发送出去,但是不成功,只有声音,没有画面。
+2. 使用 MediaCodec 的Surface 作为录屏输入,此输入数据已经是 h264 压缩格式,无法传送给 FFmpegRTMPRecorder,没有测试成功;
+3. 使用 FFmpegRTMPRecorder 录屏推流没有成功,成功的例子是使用腾讯直播SDK,见我的另一个例子:https://github.com/smartsharp/TestTencentRtmp。
+4. 使用心得: https://www.jianshu.com/p/2e21d81b350c
+
+
+
+
+
+
+
+
+
+
# RtmpRecoder
Record camera and push stream to rtmp server.
diff --git a/app/build.gradle b/app/build.gradle
index 9c3e808..7ef21f6 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -1,17 +1,21 @@
apply plugin: 'com.android.application'
// This does not break the build when Android Studio is missing the JRebel for Android plugin.
-apply plugin: 'com.zeroturnaround.jrebel.android'
+//apply plugin: 'com.zeroturnaround.jrebel.android'
android {
- compileSdkVersion 23
- buildToolsVersion "23.0.2"
+ compileSdkVersion 26
defaultConfig {
applicationId "cn.campusapp.rtmprecorder"
- minSdkVersion 14
- targetSdkVersion 23
+ minSdkVersion 23
+ targetSdkVersion 26
versionCode 1
versionName "1.0"
+
+ ndk {
+ //abiFilters 'x86', 'x86_64', 'armeabi', 'armeabi-v7a', 'arm64-v8a'
+ abiFilters 'arm64-v8a'
+ }
}
buildTypes {
debug {
@@ -31,14 +35,20 @@ android {
}
dependencies {
- compile fileTree(dir: 'libs', include: ['*.jar'])
- testCompile 'junit:junit:4.12'
- compile 'com.android.support:appcompat-v7:23.1.1'
- compile 'com.android.support:design:23.1.1'
- compile 'com.jakewharton:butterknife:7.0.1'
- compile group: 'org.bytedeco', name: 'javacv', version: '1.1'
- compile group: 'org.bytedeco.javacpp-presets', name: 'opencv', version: '3.0.0-1.1', classifier: 'android-arm'
- compile group: 'org.bytedeco.javacpp-presets', name: 'opencv', version: '3.0.0-1.1', classifier: 'android-x86'
- compile group: 'org.bytedeco.javacpp-presets', name: 'ffmpeg', version: '2.8.1-1.1', classifier: 'android-arm'
- compile group: 'org.bytedeco.javacpp-presets', name: 'ffmpeg', version: '2.8.1-1.1', classifier: 'android-x86'
+ implementation fileTree(include: ['*.jar'], dir: 'libs')
+ testImplementation 'junit:junit:4.12'
+ implementation 'com.android.support:appcompat-v7:26.1.0'
+ implementation 'com.android.support:design:26.1.0'
+ //implementation 'com.jakewharton:butterknife:8.8.1'
+ //implementation 'org.bytedeco:javacv:+'
+ //implementation 'org.bytedeco.javacpp-presets:opencv:3.0.0-1.1:android-x86'
+ //implementation 'org.bytedeco.javacpp-presets:ffmpeg:2.8.1-1.1:android-x86'
+ //implementation 'org.bytedeco.javacpp-presets:opencv:3.0.0-1.1:android-arm'
+ //implementation 'org.bytedeco.javacpp-presets:ffmpeg:2.8.1-1.1:android-arm'
+ implementation 'org.bytedeco:javacv:1.4.1'
+ //implementation group: 'org.bytedeco.javacpp-presets', name: 'opencv-platform', version: '3.4.1-1.4.1'
+ //implementation group: 'org.bytedeco.javacpp-presets', name: 'ffmpeg-platform', version: '3.4.2-1.4.1'
+ implementation 'org.bytedeco.javacpp-presets:opencv:3.4.1-1.4.1:android-arm64'
+ implementation 'org.bytedeco.javacpp-presets:ffmpeg:3.4.2-1.4.1:android-arm64'
+ implementation project(':libyuv')
}
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
index a157ca7..496f920 100644
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@@ -2,12 +2,13 @@
-
-
-
-
-
+
+
+
+
+
+
+ android:theme="@style/AppTheme">
@@ -27,8 +28,15 @@
+ android:label="@string/title_activity_record"/>
+
+
+
diff --git a/app/src/main/java/cn/campusapp/rtmprecorder/ImageUtil.java b/app/src/main/java/cn/campusapp/rtmprecorder/ImageUtil.java
new file mode 100644
index 0000000..7b306fd
--- /dev/null
+++ b/app/src/main/java/cn/campusapp/rtmprecorder/ImageUtil.java
@@ -0,0 +1,55 @@
+package cn.campusapp.rtmprecorder;
+
+public class ImageUtil {
+
+ /**
+ * Converts YUV420 NV21 to ARGB8888
+ *
+ * @param data byte array on YUV420 NV21 format.
+ * @param width pixels width
+ * @param height pixels height
+ * @return a ARGB8888 pixels int array. Where each int is a pixels ARGB.
+ */
+ public static int[] convertYUV420_NV21toARGB8888(byte [] data, int width, int height) {
+ int size = width*height;
+ int offset = size;
+ int[] pixels = new int[size];
+ int u, v, y1, y2, y3, y4;
+
+ // i along Y and the final pixels
+ // k along pixels U and V
+ for(int i=0, k=0; i < size; i+=2, k+=2) {
+ y1 = data[i ]&0xff;
+ y2 = data[i+1]&0xff;
+ y3 = data[width+i ]&0xff;
+ y4 = data[width+i+1]&0xff;
+
+ u = data[offset+k ]&0xff;
+ v = data[offset+k+1]&0xff;
+ u = u-128;
+ v = v-128;
+
+ pixels[i ] = convertYUVtoARGB(y1, u, v);
+ pixels[i+1] = convertYUVtoARGB(y2, u, v);
+ pixels[width+i ] = convertYUVtoARGB(y3, u, v);
+ pixels[width+i+1] = convertYUVtoARGB(y4, u, v);
+
+ if (i!=0 && (i+2)%width==0)
+ i+=width;
+ }
+
+ return pixels;
+ }
+
+ private static int convertYUVtoARGB(int y, int u, int v) {
+ int r,g,b;
+
+ r = y + (int)1.402f*u;
+ g = y - (int)(0.344f*v +0.714f*u);
+ b = y + (int)1.772f*v;
+ r = r>255? 255 : r<0 ? 0 : r;
+ g = g>255? 255 : g<0 ? 0 : g;
+ b = b>255? 255 : b<0 ? 0 : b;
+ return 0xff000000 | (r<<16) | (g<<8) | b;
+ }
+}
diff --git a/app/src/main/java/cn/campusapp/rtmprecorder/MainActivity.java b/app/src/main/java/cn/campusapp/rtmprecorder/MainActivity.java
index 0d02791..a3bae85 100644
--- a/app/src/main/java/cn/campusapp/rtmprecorder/MainActivity.java
+++ b/app/src/main/java/cn/campusapp/rtmprecorder/MainActivity.java
@@ -2,28 +2,45 @@
import android.os.Bundle;
import android.support.v7.app.AppCompatActivity;
+import android.view.View;
+import android.widget.Button;
+import android.widget.Toast;
-import butterknife.ButterKnife;
-import butterknife.OnClick;
+//import butterknife.ButterKnife;
+//import butterknife.OnClick;
-public class MainActivity extends AppCompatActivity {
+public class MainActivity extends AppCompatActivity implements View.OnClickListener {
+
+ private static final String STREAM_URL = "rtmp://192.168.30.77/live/livestream";
- private static final String STREAM_URL = "rtmp://10.10.5.119/live/livestream";
- @OnClick(R.id.record_btn)
- void onRecordClick(){
- startActivity(RecordActivity.makeIntent(STREAM_URL));
- }
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
- ButterKnife.bind(this);
+ //ButterKnife.bind(this);
+ Button button = (Button)findViewById(R.id.record_btn);
+ button.setOnClickListener(this);
+ button = (Button)findViewById(R.id.record_btn2);
+ button.setOnClickListener(this);
}
protected String getUrl(){
return STREAM_URL;
}
+
+ @Override
+ public void onClick(View v) {
+ if(PermissionUtil.isGrantPermissionsRequired(this)) {
+ if(v.getId()==R.id.record_btn) {
+ startActivity(RecordActivity.makeIntent(STREAM_URL));
+ }else {
+ startActivity(RecordScreenActivity.makeIntent(STREAM_URL));
+ }
+ }else {
+ Toast.makeText(this, "Permission requested.", Toast.LENGTH_SHORT).show();
+ }
+ }
}
diff --git a/app/src/main/java/cn/campusapp/rtmprecorder/PermissionUtil.java b/app/src/main/java/cn/campusapp/rtmprecorder/PermissionUtil.java
new file mode 100644
index 0000000..639dffa
--- /dev/null
+++ b/app/src/main/java/cn/campusapp/rtmprecorder/PermissionUtil.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2014 Pierre Chabardes
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cn.campusapp.rtmprecorder;
+
+import android.Manifest;
+import android.app.Activity;
+import android.content.pm.PackageManager;
+import android.os.Build;
+
+/**
+ * Created by zhanghongbiao@vargo.com.cn on 18-4-11.
+ */
+
+public class PermissionUtil {
+
+ public static final int PERMISSIONS_CODE = 1;
+
+ public static boolean isGrantPermissionsRequired(Activity activity) {
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M && activity.checkSelfPermission(
+ Manifest.permission.WRITE_EXTERNAL_STORAGE) != PackageManager.PERMISSION_GRANTED) {
+
+ activity.requestPermissions(new String[]{
+ Manifest.permission.READ_EXTERNAL_STORAGE,
+ Manifest.permission.WRITE_EXTERNAL_STORAGE,
+ Manifest.permission.RECORD_AUDIO,
+ Manifest.permission.CAMERA
+ }, PERMISSIONS_CODE);
+
+ return false;
+ }
+
+ return true;
+ }
+}
diff --git a/app/src/main/java/cn/campusapp/rtmprecorder/RecordActivity.java b/app/src/main/java/cn/campusapp/rtmprecorder/RecordActivity.java
index acf9115..6f76ade 100644
--- a/app/src/main/java/cn/campusapp/rtmprecorder/RecordActivity.java
+++ b/app/src/main/java/cn/campusapp/rtmprecorder/RecordActivity.java
@@ -10,6 +10,7 @@
import android.media.AudioRecord;
import android.media.MediaRecorder;
import android.os.Bundle;
+import android.support.v7.app.AppCompatActivity;
import android.util.Log;
import android.view.Display;
import android.view.KeyEvent;
@@ -33,10 +34,11 @@
import java.util.Comparator;
import java.util.List;
-public class RecordActivity extends Activity implements OnClickListener {
+public class RecordActivity extends AppCompatActivity implements OnClickListener {
- private final static String CLASS_LABEL = "RecordActivity";
+ private final static String CLASS_LABEL = "zhanghb/RecordActivity";
private final static String LOG_TAG = CLASS_LABEL;
+ private final static String TAG = CLASS_LABEL;
/* The number of seconds in the continuous record loop (or 0 to disable loop). */
final int RECORD_LENGTH = 0;
/* layout setting */
@@ -161,7 +163,7 @@ private void initLayout() {
//---------------------------------------
private void initRecorder() {
- Log.w(LOG_TAG, "init recorder");
+ Log.d(LOG_TAG, "init recorder: "+RECORD_LENGTH);
if (RECORD_LENGTH > 0) {
imagesIndex = 0;
@@ -198,11 +200,12 @@ public void startRecording() {
try {
recorder.start();
startTime = System.currentTimeMillis();
- recording = true;
audioThread.start();
+ recording = true;
} catch (FFmpegFrameRecorder.Exception e) {
e.printStackTrace();
+ Log.e(TAG, "startRecording exception "+e+", "+Log.getStackTraceString(e));
}
}
@@ -268,6 +271,7 @@ public void stopRecording() {
recorder.release();
} catch (FFmpegFrameRecorder.Exception e) {
e.printStackTrace();
+ Log.e(TAG, "stopRecording exception "+e+", "+Log.getStackTraceString(e));
}
recorder = null;
@@ -292,15 +296,16 @@ public boolean onKeyDown(int keyCode, KeyEvent event) {
@Override
public void onClick(View v) {
+ Log.i(TAG, "onClick: "+recording);
if (!recording) {
startRecording();
Log.w(LOG_TAG, "Start Button Pushed");
- btnRecorderControl.setText("Stop");
+ if(recording) btnRecorderControl.setText("Stop");
} else {
// This will trigger the audio recording loop to stop and then set isRecorderStart = false;
stopRecording();
Log.w(LOG_TAG, "Stop Button Pushed");
- btnRecorderControl.setText("Start");
+ if(!recording) btnRecorderControl.setText("Start");
}
}
diff --git a/app/src/main/java/cn/campusapp/rtmprecorder/RecordScreenActivity.java b/app/src/main/java/cn/campusapp/rtmprecorder/RecordScreenActivity.java
new file mode 100644
index 0000000..e510ead
--- /dev/null
+++ b/app/src/main/java/cn/campusapp/rtmprecorder/RecordScreenActivity.java
@@ -0,0 +1,162 @@
+package cn.campusapp.rtmprecorder;
+
+import android.app.Activity;
+import android.content.ComponentName;
+import android.content.Context;
+import android.content.Intent;
+import android.content.ServiceConnection;
+import android.content.pm.ActivityInfo;
+import android.media.projection.MediaProjection;
+import android.media.projection.MediaProjectionManager;
+import android.os.Bundle;
+import android.os.Environment;
+import android.os.IBinder;
+import android.support.v7.app.AppCompatActivity;
+import android.util.DisplayMetrics;
+import android.util.Log;
+import android.view.Display;
+import android.view.View;
+import android.view.View.OnClickListener;
+import android.view.WindowManager;
+import android.widget.Button;
+import android.widget.Toast;
+
+import java.io.File;
+
+
+public class RecordScreenActivity extends AppCompatActivity implements OnClickListener {
+
+ private final static String CLASS_LABEL = "zhanghb/RSActivity";
+ private final static String LOG_TAG = CLASS_LABEL;
+ private final static String TAG = CLASS_LABEL;
+
+ private Button btnControl;
+ private static final String KEY_STREAM_URL = "stream_url";
+
+ String ffmpeg_link;
+ RecordScreenService mService;
+ MediaProjectionManager projectionManager;
+
+ public static Intent makeIntent(String streamUrl){
+ Intent intent = new Intent(App.getContext(), RecordScreenActivity.class);
+ intent.putExtra(KEY_STREAM_URL, streamUrl);
+ return intent;
+ }
+
+ @Override
+ public void onCreate(Bundle savedInstanceState) {
+ super.onCreate(savedInstanceState);
+ ffmpeg_link = getIntent().getStringExtra(KEY_STREAM_URL);
+ setRequestedOrientation(ActivityInfo.SCREEN_ORIENTATION_PORTRAIT);
+
+ setContentView(R.layout.activity_record);
+
+ btnControl = (Button) findViewById(R.id.recorder_control);
+ btnControl.setText("Start");
+ btnControl.setOnClickListener(this);
+ btnControl.setEnabled(false);
+
+ projectionManager = (MediaProjectionManager) getSystemService(MEDIA_PROJECTION_SERVICE);
+ Intent intent = new Intent(this, RecordScreenService.class);
+ bindService(intent, mConnection, Context.BIND_AUTO_CREATE);
+ }
+
+ @Override
+ protected void onDestroy() {
+ Log.d(TAG, "onDestroy");
+ unbindService(mConnection);
+ super.onDestroy();
+ }
+
+ @Override
+ public void onResume() {
+ super.onResume();
+ }
+
+ private ServiceConnection mConnection = new ServiceConnection() {
+ @Override
+ public void onServiceConnected(ComponentName name, IBinder service) {
+ try {
+ RecordScreenService.RecordScreenBinder binder =
+ (RecordScreenService.RecordScreenBinder) service;
+ mService = binder.getRecordService();
+ if(mService != null){
+ if(mService.isRunning()) {
+ btnControl.setText(R.string.stop_record);
+ }
+ btnControl.setEnabled(true);
+ }
+ }catch (Exception e){
+ Log.e(TAG, "onServiceConnected exception "+e+","+Log.getStackTraceString(e));
+ }
+ }
+
+ @Override
+ public void onServiceDisconnected(ComponentName name) {
+ mService = null;
+ }
+ };
+
+
+ @Override
+ public void onClick(View v) {
+ Log.i(TAG, "onClick: "+mService.isRunning());
+ if (!mService.isRunning()) {
+ Intent captureIntent= projectionManager.createScreenCaptureIntent();
+ startActivityForResult(captureIntent, 1);
+ } else {
+ mService.stopRecord();
+ btnControl.setText(R.string.start_record);
+ }
+ }
+
+ @Override
+ protected void onActivityResult(int requestCode, int resultCode, Intent data) {
+ //super.onActivityResult(requestCode, resultCode, data);
+ if (requestCode == 1 && resultCode == RESULT_OK) {
+ DisplayMetrics mDisplayMetrics = new DisplayMetrics();//屏幕分辨率容器
+ getWindowManager().getDefaultDisplay().getMetrics(mDisplayMetrics);
+ int width = mDisplayMetrics.widthPixels;
+ int height = mDisplayMetrics.heightPixels;
+ float density = mDisplayMetrics.density;
+ int densityDpi = mDisplayMetrics.densityDpi;
+ Log.d(TAG,"Screen Ratio: ["+width+"x"+height+"],density="+density+",densityDpi="+densityDpi);
+ mService.setDispParams(width, height, densityDpi);
+ if(ffmpeg_link != null) {
+ mService.setOutputUrl(ffmpeg_link);
+ }else {
+ mService.setOutputUrl(getSavedFileUrl());
+ }
+ mService.setMediaProjection(projectionManager.getMediaProjection(resultCode, data));
+ mService.startRecord();
+ if (mService.isRunning()) {
+ btnControl.setText(R.string.stop_record);
+ }
+ }
+ }
+
+ public String getSavedFileUrl() {
+ String filepath = getsaveDirectory() + System.currentTimeMillis() + ".mp4";
+ return "file://"+filepath;
+ }
+
+ public String getsaveDirectory() {
+ if (Environment.getExternalStorageState().equals(Environment.MEDIA_MOUNTED)) {
+ String rootDir = Environment.getExternalStorageDirectory().getAbsolutePath() + "/" + "ScreenRecord" + "/";
+
+ File file = new File(rootDir);
+ if (!file.exists()) {
+ if (!file.mkdirs()) {
+ return null;
+ }
+ }
+
+ Toast.makeText(getApplicationContext(), rootDir, Toast.LENGTH_SHORT).show();
+
+ return rootDir;
+ } else {
+ return null;
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/app/src/main/java/cn/campusapp/rtmprecorder/RecordScreenService.java b/app/src/main/java/cn/campusapp/rtmprecorder/RecordScreenService.java
new file mode 100644
index 0000000..9441e10
--- /dev/null
+++ b/app/src/main/java/cn/campusapp/rtmprecorder/RecordScreenService.java
@@ -0,0 +1,601 @@
+package cn.campusapp.rtmprecorder;
+
+import android.app.Service;
+import android.content.Intent;
+import android.graphics.ImageFormat;
+import android.graphics.PixelFormat;
+import android.graphics.SurfaceTexture;
+import android.hardware.display.DisplayManager;
+import android.hardware.display.VirtualDisplay;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.Image;
+import android.media.ImageReader;
+import android.media.MediaCodec;
+import android.media.MediaCodecInfo;
+import android.media.MediaFormat;
+import android.media.MediaRecorder;
+import android.media.projection.MediaProjection;
+import android.os.Binder;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.IBinder;
+import android.os.Message;
+import android.support.annotation.NonNull;
+import android.util.Log;
+import android.view.Surface;
+
+import com.wuwang.libyuv.Key;
+import com.wuwang.libyuv.YuvUtils;
+
+import org.bytedeco.javacv.FFmpegFrameRecorder;
+import org.bytedeco.javacv.Frame;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ShortBuffer;
+
+import static org.bytedeco.javacpp.avcodec.AV_CODEC_ID_H264;
+
+public class RecordScreenService extends Service {
+ private static final String TAG = "zhanghb/RSService";
+ private HandlerThread myHandlerThread;
+ private Handler myHandler;
+ private boolean running;
+ private int mWidth, mHeight, mDpi;
+ private MediaProjection mediaProjection;
+ private MediaRecorder mediaRecorder;
+ private VirtualDisplay virtualDisplay;
+ private String outputUrl;
+ private boolean recordToFile = false;
+ // add for rtmp encoder
+ private FFmpegFrameRecorder frameRecorder;
+ private int frameRate = 30;
+
+ private MediaCodec mEncoder;
+ private long mVideoPtsOffset, mAudioPtsOffset;
+
+ private long startTime;
+ private Frame yuvImage = null;
+ private Surface mSurface;
+ private static final int MAX_IMAGE_NUMBER = 25;//30;//这个值代表ImageReader最大的存储图像
+ private ImageReader mImageReader;
+
+ private static final int FRAME_FROM_MY_SURFACE = 0;
+ private static final int FRAME_FROM_IMAGE_READER = 1;
+ private static final int FRAME_FROM_MEDIA_CODEC = 2;
+ private static final boolean RECORD_WITH_AUTIO = true;
+
+ private static final int frame_source = FRAME_FROM_IMAGE_READER;
+
+ /* audio data getting thread */
+ private AudioRecord audioRecord;
+ private AudioRecordRunnable audioRecordRunnable;
+ private Thread audioThread;
+ boolean recording = false;
+ volatile boolean runAudioThread = true;
+ private int sampleAudioRateInHz = 44100;
+
+ private RtmpThread mRtmpThread;
+ private Object frameLock = new Object();
+ private boolean frameAvail = false;
+
+ // mysurface
+ private int mTextureId = 1;
+ private SurfaceTexture mSurfaceTexture;
+
+ public RecordScreenService() {
+ }
+
+ @Override
+ public void onCreate() {
+ super.onCreate();
+ myHandlerThread = new HandlerThread("service_thread",
+ android.os.Process.THREAD_PRIORITY_BACKGROUND);
+ myHandlerThread.start();
+ myHandler = new Handler( myHandlerThread.getLooper() ){
+ @Override
+ public void handleMessage(Message msg) {
+ //super.handleMessage(msg);
+ //这个方法是运行在 handler-thread 线程中的 ,可以执行耗时操作
+ Log.d( "handler " , "消息: " + msg.what + " 线程: " + Thread.currentThread().getName() ) ;
+ }
+ };
+ running = false;
+ }
+
+ @Override
+ public void onDestroy() {
+ myHandlerThread.quit();
+ super.onDestroy();
+ }
+
+ @Override
+ public IBinder onBind(Intent intent) {
+ Log.e(TAG, "onBind");
+ return new RecordScreenBinder();
+ }
+
+ @Override
+ public boolean onUnbind(Intent intent) {
+ Log.e(TAG, "onUnbind");
+ stopRecord();
+ return super.onUnbind(intent);
+ }
+
+ public void setDispParams(int width, int height, int densityDpi) {
+ this.mWidth = width;
+ this.mHeight = height;
+ this.mDpi = densityDpi;
+ }
+
+ public void setOutputUrl(String outputUrl){
+ this.outputUrl = outputUrl;
+ }
+
+ public class RecordScreenBinder extends Binder {
+ public RecordScreenService getRecordService() {
+ return RecordScreenService.this;
+ }
+ }
+
+ public boolean isRunning() {
+ return running;
+ }
+
+ public void setMediaProjection(MediaProjection m){
+ mediaProjection = m;
+ }
+
+ public boolean startRecord() {
+ if (mediaProjection == null || outputUrl == null || running) {
+ return false;
+ }
+ Log.d(TAG, "startRecord "+outputUrl);
+ recordToFile = outputUrl.startsWith("file://");
+ if(recordToFile) {
+ if(initFileRecorder()){
+ mediaRecorder.start();
+ running = true;
+ }
+ }else{
+ if(initRtmpRecorder()) {
+ try {
+ if(mRtmpThread != null){
+ mRtmpThread.cancel();
+ }
+ if(frame_source != FRAME_FROM_MEDIA_CODEC){
+ mRtmpThread = new RtmpThread();
+ mRtmpThread.start();
+ }
+ frameRecorder.start();
+ startTime = System.currentTimeMillis();
+ if(RECORD_WITH_AUTIO) {
+ audioThread.start();
+ }
+ if(frame_source == FRAME_FROM_MEDIA_CODEC){
+ mEncoder.start();
+ }
+ recording = true;
+ running = true;
+ }catch (Exception e){
+ Log.e(TAG, "startRecord exception "+e+","+Log.getStackTraceString(e));
+ deinitRtmpRecorder();
+ }
+ }
+ }
+ Log.d(TAG, "startRecord result="+running);
+ return running;
+ }
+ public boolean stopRecord() {
+ if (!running) {
+ return false;
+ }
+ Log.d(TAG, "stopRecord "+outputUrl);
+ if(recordToFile) {
+ deinitFileRecorder();
+ }else{
+ deinitRtmpRecorder();
+ }
+ running = false;
+
+ return true;
+ }
+ private boolean initFileRecorder() {
+ Log.d(TAG, "initFileRecorder ");
+ try {
+ mediaRecorder = new MediaRecorder();
+ mediaRecorder.setAudioSource(MediaRecorder.AudioSource.MIC);
+ mediaRecorder.setVideoSource(MediaRecorder.VideoSource.SURFACE);
+ mediaRecorder.setOutputFormat(MediaRecorder.OutputFormat.THREE_GPP);
+ mediaRecorder.setOutputFile(outputUrl.substring(7));
+ mediaRecorder.setVideoSize(mWidth, mHeight);
+ mediaRecorder.setVideoEncoder(MediaRecorder.VideoEncoder.H264);
+ mediaRecorder.setAudioEncoder(MediaRecorder.AudioEncoder.AMR_NB);
+ mediaRecorder.setVideoEncodingBitRate(5 * 1024 * 1024);
+ mediaRecorder.setVideoFrameRate(30);
+ mediaRecorder.prepare();
+ mSurface = mediaRecorder.getSurface();
+ virtualDisplay = mediaProjection.createVirtualDisplay("MainScreen", mWidth, mHeight, mDpi,
+ DisplayManager.VIRTUAL_DISPLAY_FLAG_AUTO_MIRROR, mSurface, null, null);
+ return true;
+ } catch (IOException e) {
+ e.printStackTrace();
+ Log.e(TAG, "initFileRecorder exception "+e+","+Log.getStackTraceString(e));
+ deinitFileRecorder();
+ return false;
+ }
+ }
+ private void deinitFileRecorder() {
+ Log.d(TAG, "deinitFileRecorder ");
+ if(mediaRecorder != null) {
+ mediaRecorder.stop();
+ //mediaRecorder.reset();
+ mediaRecorder.release();
+ if(virtualDisplay != null)
+ virtualDisplay.release();
+ mediaProjection.stop();
+ mediaRecorder = null;
+ }
+ }
+ private boolean initRtmpRecorder() {
+ Log.d(TAG, "initRtmpRecorder "+","+frame_source);
+ try {
+ if (mRtmpThread != null) {
+ mRtmpThread.cancel();
+ mRtmpThread = null;
+ }
+ // init rtmp recorder
+ yuvImage = new Frame(mWidth, mHeight, Frame.DEPTH_UBYTE, 4);
+
+ frameRecorder = new FFmpegFrameRecorder(outputUrl, mWidth, mHeight, 1);
+ frameRecorder.setVideoCodec(AV_CODEC_ID_H264);//28);
+ frameRecorder.setFormat("flv");
+ frameRecorder.setSampleRate(sampleAudioRateInHz);
+ // Set in the surface changed method
+ frameRecorder.setFrameRate(frameRate); // 30fps
+ //frameRecorder.setVideoQuality(0);
+
+ if (RECORD_WITH_AUTIO) {
+ audioRecordRunnable = new AudioRecordRunnable();
+ audioThread = new Thread(audioRecordRunnable);
+ runAudioThread = true;
+ }
+
+ if(frame_source == FRAME_FROM_MY_SURFACE){
+ mSurfaceTexture = new SurfaceTexture(mTextureId);
+ mSurface = new Surface(mSurfaceTexture);
+ mSurfaceTexture.setOnFrameAvailableListener(frameListener, myHandler);
+ }else if(frame_source == FRAME_FROM_IMAGE_READER){
+ mImageReader = ImageReader.newInstance(mWidth, mHeight, PixelFormat.RGBA_8888, MAX_IMAGE_NUMBER);
+ mSurface = mImageReader.getSurface();
+ mImageReader.setOnImageAvailableListener(imageListener, myHandler);
+ }else {
+ MediaFormat format = MediaFormat.createVideoFormat("video/avc", mWidth, mHeight); // H.264 Advanced Video Coding
+ format.setInteger(MediaFormat.KEY_COLOR_FORMAT,
+ MediaCodecInfo.CodecCapabilities.COLOR_FormatSurface);
+ format.setInteger(MediaFormat.KEY_BIT_RATE, 1000000); // 500Kbps, 1Mbps
+ format.setInteger(MediaFormat.KEY_FRAME_RATE, 30); // 30fps
+ format.setInteger(MediaFormat.KEY_I_FRAME_INTERVAL, 2); // 2 seconds between I-frames
+ Log.d(TAG, "created video format: " + format);
+ mEncoder = MediaCodec.createEncoderByType("video/avc");
+ mEncoder.configure(format, null, null, MediaCodec.CONFIGURE_FLAG_ENCODE);
+ //mEncoder.setOnFrameRenderedListener(mediaFrameListener, myHandler);
+ mEncoder.setCallback(frameCallback, myHandler);
+ mSurface = mEncoder.createInputSurface();
+ }
+ virtualDisplay = mediaProjection.createVirtualDisplay("MainScreen", mWidth, mHeight, mDpi,
+ DisplayManager.VIRTUAL_DISPLAY_FLAG_AUTO_MIRROR, mSurface, null, null);
+
+ return true;
+ }catch (Exception e){
+ Log.e(TAG,"initRtmpRecorder exception "+e+","+Log.getStackTraceString(e));
+ return false;
+ }
+ }
+ private void deinitRtmpRecorder() {
+ Log.d(TAG, "deinitRtmpRecorder "+FRAME_FROM_IMAGE_READER);
+ if(mRtmpThread != null){
+ mRtmpThread.cancel();
+ mRtmpThread = null;
+ }
+ if(RECORD_WITH_AUTIO) {
+ runAudioThread = false;
+ try {
+ audioThread.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ audioRecordRunnable = null;
+ audioThread = null;
+ }
+
+ if (virtualDisplay != null) {
+ mediaProjection.stop();
+ virtualDisplay.release();
+ virtualDisplay = null;
+ }
+ if(frame_source == FRAME_FROM_MY_SURFACE) {
+ if(mSurfaceTexture != null) {
+ mSurfaceTexture.release();
+ mSurfaceTexture = null;
+ }
+ }else if(frame_source == FRAME_FROM_IMAGE_READER){
+ if(mImageReader != null){
+ mImageReader.close();
+ }
+ }else {
+ if (mEncoder != null) {
+ mEncoder.stop();
+ mEncoder.release();
+ mEncoder = null;
+ }
+ }
+ if (frameRecorder != null && recording) {
+ recording = false;
+ try {
+ frameRecorder.stop();
+ frameRecorder.release();
+ } catch (FFmpegFrameRecorder.Exception e) {
+ e.printStackTrace();
+ Log.e(TAG, "deinitRtmpRecorder exception "+e+", "+Log.getStackTraceString(e));
+ }
+ frameRecorder = null;
+ }
+ }
+
+
+ private MediaCodec.Callback frameCallback = new MediaCodec.Callback() {
+ @Override
+ public void onInputBufferAvailable(@NonNull MediaCodec codec, int index) {
+
+ }
+
+ @Override
+ public void onOutputBufferAvailable(@NonNull MediaCodec codec, int index, @NonNull MediaCodec.BufferInfo buffer) {
+ ByteBuffer encodedData = codec.getOutputBuffer(index);
+ //writeSampleData(mVideoTrackIndex, buffer, encodedData);
+ if ((buffer.flags & MediaCodec.BUFFER_FLAG_CODEC_CONFIG) != 0) {
+ // The codec config data was pulled out and fed to the muxer when we got
+ // the INFO_OUTPUT_FORMAT_CHANGED status.
+ // Ignore it.
+ buffer.size = 0;
+ }
+ boolean eos = (buffer.flags & MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0;
+ if (buffer.size == 0 && !eos) {
+ encodedData = null;
+ } else {
+ if (buffer.presentationTimeUs != 0) { // maybe 0 if eos
+ if (mVideoPtsOffset == 0) {
+ mVideoPtsOffset = buffer.presentationTimeUs;
+ buffer.presentationTimeUs = 0;
+ } else {
+ buffer.presentationTimeUs -= mVideoPtsOffset;
+ }
+ }
+ /*if (VERBOSE)
+ Log.d(TAG, "[" + Thread.currentThread().getId() + "] Got buffer, track=" + track
+ + ", info: size=" + buffer.size
+ + ", presentationTimeUs=" + buffer.presentationTimeUs);
+ if (!eos && mCallback != null) {
+ mCallback.onRecording(buffer.presentationTimeUs);
+ }*/
+ }
+ if (encodedData != null) {
+ encodedData.position(buffer.offset);
+ encodedData.limit(buffer.offset + buffer.size);
+ //byte[] bytes = new byte[encodedData.remaining()];
+ //encodedData.get(bytes);
+ /*((ByteBuffer) yuvImage.image[0].position(0)).put(encodedData);
+ try {
+ long t = 1000 * (System.currentTimeMillis() - startTime);
+ Log.v(TAG, "Writing Frame t=" + t + ", ts=" + frameRecorder.getTimestamp());
+ if (t > frameRecorder.getTimestamp()) {
+ frameRecorder.setTimestamp(t);
+ }
+
+ frameRecorder.record(yuvImage, PixelFormat.);
+ } catch (FFmpegFrameRecorder.Exception e) {
+ Log.e(TAG, "RtmpThread exception " + e + "," + Log.getStackTraceString(e));
+ }*/
+ }
+
+ codec.releaseOutputBuffer(index, true);
+ }
+
+ @Override
+ public void onError(@NonNull MediaCodec codec, @NonNull MediaCodec.CodecException e) {
+
+ }
+
+ @Override
+ public void onOutputFormatChanged(@NonNull MediaCodec codec, @NonNull MediaFormat format) {
+
+ }
+ };
+ private SurfaceTexture.OnFrameAvailableListener frameListener = new SurfaceTexture.OnFrameAvailableListener() {
+
+ @Override
+ public void onFrameAvailable(SurfaceTexture surfaceTexture) {
+ Log.v(TAG, "OnFrameAvail");
+ synchronized (frameLock){
+ frameAvail = true;
+ frameLock.notifyAll();
+ }
+ }
+ };
+ private ImageReader.OnImageAvailableListener imageListener = new ImageReader.OnImageAvailableListener() {
+ @Override
+ public void onImageAvailable(ImageReader reader) {
+ Log.v(TAG, "OnImageAvail");
+ synchronized (frameLock){
+ frameAvail = true;
+ frameLock.notifyAll();
+ }
+ }
+ };
+
+ private MediaCodec.OnFrameRenderedListener mediaFrameListener = new MediaCodec.OnFrameRenderedListener() {
+ @Override
+ public void onFrameRendered(@NonNull MediaCodec codec, long presentationTimeUs, long nanoTime) {
+ Log.v(TAG, "onFrameRendered");
+ synchronized (frameLock){
+ frameAvail = true;
+ frameLock.notifyAll();
+ }
+ }
+ };
+ //---------------------------------------------
+ // audio thread, gets and encodes audio data
+ //---------------------------------------------
+ class AudioRecordRunnable implements Runnable {
+
+ @Override
+ public void run() {
+ android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_URGENT_AUDIO);
+
+ // Audio
+ int bufferSize;
+ ShortBuffer audioData;
+ int bufferReadResult;
+
+ bufferSize = AudioRecord.getMinBufferSize(sampleAudioRateInHz,
+ AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
+ audioRecord = new AudioRecord(MediaRecorder.AudioSource.MIC, sampleAudioRateInHz,
+ AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize);
+
+
+ audioData = ShortBuffer.allocate(bufferSize);
+
+ Log.d(TAG, "audioRecord startRecording()");
+ audioRecord.startRecording();
+
+ /* ffmpeg_audio encoding loop */
+ while (runAudioThread) {
+ //Log.v(LOG_TAG,"recording? " + recording);
+ bufferReadResult = audioRecord.read(audioData.array(), 0, audioData.capacity());
+ audioData.limit(bufferReadResult);
+ if (bufferReadResult > 0) {
+ //Log.v(TAG, "audioRecord bufferReadResult: " + bufferReadResult);
+ // If "recording" isn't true when start this thread, it never get's set according to this if statement...!!!
+ // Why? Good question...
+ if (recording) {
+ try {
+ frameRecorder.recordSamples(audioData);
+ //Log.v(LOG_TAG,"recording " + 1024*i + " to " + 1024*i+1024);
+ } catch (FFmpegFrameRecorder.Exception e) {
+ Log.e(TAG, "audioRecord: exception "+e+","+Log.getStackTraceString(e));
+ }
+ }
+ }
+ }
+ Log.v(TAG, "audioRecord Finished");
+
+ /* encoding finish, release recorder */
+ if (audioRecord != null) {
+ audioRecord.stop();
+ audioRecord.release();
+ audioRecord = null;
+ }
+ }
+ }
+
+ private final class RtmpThread extends Thread {
+ boolean running = false;
+ boolean ended = true;
+ public void cancel(){
+ Log.d(TAG,"cancel scannerthread");
+ long t1 = System.currentTimeMillis();
+ running = false;
+ int count = 0;
+ while(!ended && (count++ <50)){
+ try {
+ Thread.sleep(100);
+ }catch (Exception e) {}
+ }
+ Log.d(TAG,"cancel MarkerThread consume "+(System.currentTimeMillis()-t1)+" ms");
+ }
+
+ @Override
+ public void run() {
+ running = true;
+ ended = false;
+ setName("RtmpThread");
+
+ android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_URGENT_AUDIO);
+
+ Log.d(TAG,"RtmpThread started.");
+ boolean frameavail = false;
+ while (running) {
+ synchronized (frameLock){
+ try {
+ frameLock.wait();
+ frameavail = frameAvail;
+ }catch (Exception e){}
+ frameAvail = false;
+ }
+ if(running && frameavail){
+ //Log.d(TAG,"RtmpThread check image");
+ if(frame_source == FRAME_FROM_MY_SURFACE) {
+ mSurfaceTexture.updateTexImage();
+ Log.d(TAG,"RtmpThread frame updated.");
+
+ }else if(frame_source == FRAME_FROM_IMAGE_READER) {
+ Image mImage = null;
+ while((mImage = mImageReader.acquireNextImage())!=null) {
+ int mWidth = mImage.getWidth();
+ int mHeight = mImage.getHeight();
+ //Log.d(TAG,"RtmpThread image found " + mWidth+","+mHeight);
+ if (recording) {
+ /*
+ // Four bytes per pixel: width * height * 4.
+ byte[] rgbaBytes = new byte[mWidth * mHeight * 4];
+ // put the data into the rgbaBytes array.
+ mImage.getPlanes()[0].getBuffer().get(rgbaBytes);
+ mImage.close(); // Access to the image is no longer needed, release it.
+
+ // Create a yuv byte array: width * height * 1.5 ().
+ byte[] yuv = new byte[mWidth * mHeight * 3 / 2];
+ int ret = YuvUtils.RgbaToI420(Key.ARGB_TO_I420, rgbaBytes, yuv, mWidth, mHeight);
+ if (ret == 0) {
+ ((ByteBuffer) yuvImage.image[0].position(0)).put(yuv);
+ try {
+ long t = 1000 * (System.currentTimeMillis() - startTime);
+ Log.v(TAG, "Writing Frame t=" + t + ", ts=" + frameRecorder.getTimestamp());
+ if (t > frameRecorder.getTimestamp()) {
+ frameRecorder.setTimestamp(t);
+ }
+
+ frameRecorder.record(yuvImage);
+ } catch (FFmpegFrameRecorder.Exception e) {
+ Log.e(TAG, "RtmpThread exception " + e + "," + Log.getStackTraceString(e));
+ }
+ }*/
+ ((ByteBuffer) yuvImage.image[0].position(0)).put(mImage.getPlanes()[0].getBuffer());
+ mImage.close();
+
+ try {
+ long t = 1000 * (System.currentTimeMillis() - startTime);
+ Log.v(TAG, "Writing Frame t=" + t + ", ts=" + frameRecorder.getTimestamp());
+ if (t > frameRecorder.getTimestamp()) {
+ frameRecorder.setTimestamp(t);
+ }
+
+ frameRecorder.record(yuvImage);
+ } catch (FFmpegFrameRecorder.Exception e) {
+ Log.e(TAG, "RtmpThread exception " + e + "," + Log.getStackTraceString(e));
+ }
+ }
+ }
+ }else{
+ // TBD
+ }
+ }
+ }
+ ended = true;
+ Log.d(TAG,"RtmpThread ended.");
+ }
+ }
+
+
+}
diff --git a/app/src/main/res/layout/activity_main.xml b/app/src/main/res/layout/activity_main.xml
index 1ef2ff7..85a9c1a 100644
--- a/app/src/main/res/layout/activity_main.xml
+++ b/app/src/main/res/layout/activity_main.xml
@@ -1,6 +1,7 @@
-
-
+
+
diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml
index 13ca886..132f25b 100644
--- a/app/src/main/res/values/strings.xml
+++ b/app/src/main/res/values/strings.xml
@@ -1,4 +1,7 @@
RtmpRecorder
RecordActivity
+ RecordScreenActivity
+ 开始录制
+ 停止录制
diff --git a/build.gradle b/build.gradle
index 6b7f404..bdb82c0 100644
--- a/build.gradle
+++ b/build.gradle
@@ -2,15 +2,17 @@
buildscript {
repositories {
+ google()
jcenter()
- maven {
- url 'https://repos.zeroturnaround.com/nexus/content/repositories/zt-public-releases'
- }
+
+ //maven {
+ // url 'https://repos.zeroturnaround.com/nexus/content/repositories/zt-public-releases'
+ //}
}
dependencies {
- classpath 'com.android.tools.build:gradle:1.5.0'
+ classpath 'com.android.tools.build:gradle:3.1.2'
// This does not break the build when Android Studio is missing the JRebel for Android plugin.
- classpath 'com.zeroturnaround.jrebel.android:jr-android-gradle:1.0.+'
+ //classpath 'com.zeroturnaround.jrebel.android:jr-android-gradle:1.0.+'
// NOTE: Do not place your application dependencies here; they belong
// in the individual module build.gradle files
@@ -19,8 +21,8 @@ buildscript {
allprojects {
repositories {
+ google()
jcenter()
- mavenCentral()
}
}
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index f23df6e..0c83d56 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
-#Wed Oct 21 11:34:03 PDT 2015
+#Thu May 03 15:16:28 CST 2018
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-2.8-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/libyuv/.gitignore b/libyuv/.gitignore
new file mode 100644
index 0000000..0886f94
--- /dev/null
+++ b/libyuv/.gitignore
@@ -0,0 +1,2 @@
+/build
+/.externalNativeBuild
diff --git a/libyuv/CMakeLists.txt b/libyuv/CMakeLists.txt
new file mode 100644
index 0000000..e69e5d9
--- /dev/null
+++ b/libyuv/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.4.1)
+
+include_directories(src/main/cpp/libyuv/include)
+add_subdirectory(src/main/cpp/libyuv ./build)
+aux_source_directory(src/main/cpp SRC_FILE)
+add_library(DoggyYuv SHARED ${SRC_FILE})
+find_library(log-lib log)
+target_link_libraries(DoggyYuv ${log-lib} yuv)
\ No newline at end of file
diff --git a/libyuv/build.gradle b/libyuv/build.gradle
new file mode 100644
index 0000000..8623cc1
--- /dev/null
+++ b/libyuv/build.gradle
@@ -0,0 +1,43 @@
+apply plugin: 'com.android.library'
+
+android {
+ compileSdkVersion 26
+
+
+
+ defaultConfig {
+ minSdkVersion 23
+ targetSdkVersion 26
+ versionCode 1
+ versionName "1.0"
+
+ testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+
+ externalNativeBuild {
+ cmake {
+ cppFlags "-std=c++11 -frtti -fexceptions"
+ }
+ }
+ }
+ buildTypes {
+ release {
+ minifyEnabled false
+ proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+ }
+ }
+ externalNativeBuild {
+ cmake {
+ path 'CMakeLists.txt'
+ }
+ }
+
+}
+
+dependencies {
+ implementation fileTree(dir: 'libs', include: ['*.jar'])
+
+ implementation 'com.android.support:appcompat-v7:26.1.0'
+ testImplementation 'junit:junit:4.12'
+ androidTestImplementation 'com.android.support.test:runner:1.0.2'
+ androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2'
+}
\ No newline at end of file
diff --git a/libyuv/proguard-rules.pro b/libyuv/proguard-rules.pro
new file mode 100644
index 0000000..d14c450
--- /dev/null
+++ b/libyuv/proguard-rules.pro
@@ -0,0 +1,25 @@
+# Add project specific ProGuard rules here.
+# By default, the flags in this file are appended to flags specified
+# in D:\Android\android-sdk/tools/proguard/proguard-android.txt
+# You can edit the include path and order by changing the proguardFiles
+# directive in build.gradle.
+#
+# For more details, see
+# http://developer.android.com/guide/developing/tools/proguard.html
+
+# Add any project specific keep options here:
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+# public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
diff --git a/libyuv/src/androidTest/java/com/doggycoder/libyuv/ExampleInstrumentedTest.java b/libyuv/src/androidTest/java/com/doggycoder/libyuv/ExampleInstrumentedTest.java
new file mode 100644
index 0000000..f25fb02
--- /dev/null
+++ b/libyuv/src/androidTest/java/com/doggycoder/libyuv/ExampleInstrumentedTest.java
@@ -0,0 +1,26 @@
+package com.doggycoder.libyuv;
+
+import android.content.Context;
+import android.support.test.InstrumentationRegistry;
+import android.support.test.runner.AndroidJUnit4;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.*;
+
+/**
+ * Instrumentation test, which will execute on an Android device.
+ *
+ * @see Testing documentation
+ */
+@RunWith(AndroidJUnit4.class)
+public class ExampleInstrumentedTest {
+ @Test
+ public void useAppContext() throws Exception {
+ // Context of the app under test.
+ Context appContext = InstrumentationRegistry.getTargetContext();
+
+ assertEquals("com.doggycoder.libyuv.test", appContext.getPackageName());
+ }
+}
diff --git a/libyuv/src/main/AndroidManifest.xml b/libyuv/src/main/AndroidManifest.xml
new file mode 100644
index 0000000..c1e37ab
--- /dev/null
+++ b/libyuv/src/main/AndroidManifest.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
diff --git a/libyuv/src/main/cpp/YuvJni.cpp b/libyuv/src/main/cpp/YuvJni.cpp
new file mode 100644
index 0000000..f770b59
--- /dev/null
+++ b/libyuv/src/main/cpp/YuvJni.cpp
@@ -0,0 +1,229 @@
+#include
+#include "libyuv.h"
+#include "jni.h"
+#include "android/log.h"
+
+#define YUV_UTILS_JAVA "com/wuwang/libyuv/YuvUtils"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static int (*rgbaToI420Func[])(const uint8 *,int,uint8 *,int,uint8 *,int ,uint8 *,int,int,int)={
+ libyuv::ABGRToI420,libyuv::RGBAToI420,libyuv::ARGBToI420,libyuv::BGRAToI420,
+ libyuv::RGB24ToI420,libyuv::RGB565ToI420
+};
+
+static int (*i420ToRgbaFunc[])(const uint8 *,int, const uint8 *,int,const uint8 *,int,uint8 *,
+ int ,int ,int )={
+ libyuv::I420ToABGR,libyuv::I420ToRGBA,libyuv::I420ToARGB,libyuv::I420ToBGRA,
+ libyuv::I420ToRGB24,libyuv::I420ToRGB565
+};
+
+static void (*rotatePlaneFunc[])(const uint8* src,int src_stride,uint8* dst,int dst_stride,
+ int width,int height)={
+ libyuv::RotatePlane90,libyuv::RotatePlane180,libyuv::RotatePlane270,
+};
+
+static void (*rotateUVFunc[])(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width, int height)={
+ libyuv::RotateUV90,libyuv::RotateUV180,libyuv::RotateUV270,
+};
+
+int rgbaToI420(JNIEnv * env,jclass clazz,jbyteArray rgba,jint rgba_stride,
+ jbyteArray yuv,jint y_stride,jint u_stride,jint v_stride,
+ jint width,jint height,
+ int (*func)(const uint8 *,int,uint8 *,int,uint8 *,int ,uint8 *,int,int,int)){
+ size_t ySize=(size_t) (y_stride * height);
+ size_t uSize=(size_t) (u_stride * height >> 1);
+ jbyte * rgbaData= env->GetByteArrayElements(rgba,JNI_FALSE);
+ jbyte * yuvData=env->GetByteArrayElements(yuv,JNI_FALSE);
+ int ret=func((const uint8 *) rgbaData, rgba_stride, (uint8 *) yuvData, y_stride,
+ (uint8 *) (yuvData) + ySize, u_stride, (uint8 *) (yuvData )+ ySize + uSize,
+ v_stride, width, height);
+ env->ReleaseByteArrayElements(rgba,rgbaData,JNI_OK);
+ env->ReleaseByteArrayElements(yuv,yuvData,JNI_OK);
+ return ret;
+}
+
+int i420ToRgba(JNIEnv * env,jclass clazz,jbyteArray yuv,jint y_stride,jint u_stride,jint v_stride,
+ jbyteArray rgba,jint rgba_stride,jint width,jint height,
+ int (*func)(const uint8 *,int, const uint8 *,int,const uint8 *,int,uint8 *,
+ int ,int ,int )){
+ size_t ySize=(size_t) (y_stride * height);
+ size_t uSize=(size_t) (u_stride * height >> 1);
+ jbyte * rgbaData= env->GetByteArrayElements(rgba,JNI_FALSE);
+ jbyte * yuvData=env->GetByteArrayElements(yuv,JNI_FALSE);
+ int ret=func((const uint8 *) yuvData, y_stride, (uint8 *) yuvData + ySize, u_stride,
+ (uint8 *) (yuvData)+ ySize + uSize, v_stride, (uint8 *) (rgbaData),
+ rgba_stride, width, height);
+ env->ReleaseByteArrayElements(rgba,rgbaData,JNI_OK);
+ env->ReleaseByteArrayElements(yuv,yuvData,JNI_OK);
+ return ret;
+}
+
+int Jni_I420ToNV21(JNIEnv * env,jclass clazz,jbyteArray yuv420p,jbyteArray yuv420sp,jint width,jint height,jboolean swapUV){
+ size_t ySize=(size_t) (width * height);
+ size_t uSize=(size_t) (width * height >> 2);
+ size_t stride[]={0,uSize};
+ jbyte * yuv420pData=env->GetByteArrayElements(yuv420p,JNI_FALSE);
+ jbyte * yuv420spData=env->GetByteArrayElements(yuv420sp,JNI_FALSE);
+ int ret=libyuv::I420ToNV21((const uint8 *) yuv420pData, width,
+ (const uint8 *) (yuv420pData + ySize + stride[swapUV]), width >> 1,
+ (const uint8 *) (yuv420pData + ySize + stride[1-swapUV]), width >> 1,
+ (uint8 *) yuv420spData, width, (uint8 *) (yuv420spData + ySize), width, width, height);
+ env->ReleaseByteArrayElements(yuv420p,yuv420pData,JNI_OK);
+ env->ReleaseByteArrayElements(yuv420sp,yuv420spData,JNI_OK);
+ return ret;
+}
+
+int Jni_NV21ToI420(JNIEnv * env,jclass clazz,jbyteArray yuv420p,jbyteArray yuv420sp,jint width,jint height,jboolean swapUV){
+ size_t ySize=(size_t) (width * height);
+ size_t uSize=(size_t) (width * height >> 2);
+ size_t stride[]={0,uSize};
+ jbyte * yuv420pData=env->GetByteArrayElements(yuv420p,JNI_FALSE);
+ jbyte * yuv420spData=env->GetByteArrayElements(yuv420sp,JNI_FALSE);
+ int ret=libyuv::NV21ToI420((const uint8 *) yuv420spData, width, (const uint8 *) (yuv420spData + ySize), width,
+ (uint8 *) yuv420pData, width,
+ (uint8 *) (yuv420pData + ySize + stride[swapUV]), width >> 1,
+ (uint8 *) (yuv420pData + ySize + stride[1-swapUV]), width >> 1, width, height);
+ env->ReleaseByteArrayElements(yuv420p,yuv420pData,JNI_OK);
+ env->ReleaseByteArrayElements(yuv420sp,yuv420spData,JNI_OK);
+ return ret;
+}
+
+void Jni_NV21Scale(JNIEnv * env,jclass clazz,jbyteArray src,jint width,jint height,jbyteArray dst,jint dst_width,jint dst_height,int mode){
+ size_t ySize=(size_t) (width * height);
+ size_t dstYSize=(size_t) (dst_width*dst_height);
+ jbyte * srcData=env->GetByteArrayElements(src,JNI_FALSE);
+ jbyte * dstData=env->GetByteArrayElements(dst,JNI_FALSE);
+ libyuv::ScalePlane((const uint8 *) srcData, width, width, height,
+ (uint8 *) dstData, dst_width, dst_width, dst_height,(libyuv::FilterMode) mode);
+ libyuv::ScalePlane((const uint8 *)(srcData+ySize) , width, width, height>>1,
+ (uint8 *)(dstData+dstYSize), dst_width, dst_width, dst_height>>1,
+ (libyuv::FilterMode) mode);
+ env->ReleaseByteArrayElements(src,srcData,JNI_OK);
+ env->ReleaseByteArrayElements(dst,dstData,JNI_OK);
+}
+
+void Jni_I420Scale(JNIEnv * env,jclass clazz,jbyteArray src,jint width,jint height,jbyteArray dst,jint dst_width,jint dst_height,int mode,jboolean swapUV){
+ int ySize=width * height;
+ int swap[]={0,ySize>>2};
+ size_t dstYSize=(size_t) (dst_width*dst_height);
+ jbyte * srcData=env->GetByteArrayElements(src,JNI_FALSE);
+ jbyte * dstData=env->GetByteArrayElements(dst,JNI_FALSE);
+ libyuv::I420Scale((const uint8 *) srcData, width, (const uint8 *) (srcData + ySize), width >> 1,
+ (uint8 *)(srcData + ySize + (ySize >> 2)), width >> 1, width, height,
+ (uint8 *) dstData, dst_width, (uint8 *) (dstData + dstYSize+swap[swapUV]), dst_width >> 1,
+ (uint8 *) (dstData + dstYSize + swap[1-swapUV]), dst_width >> 1,
+ dst_width, dst_height, (libyuv::FilterMode) mode);
+ env->ReleaseByteArrayElements(src,srcData,JNI_OK);
+ env->ReleaseByteArrayElements(dst,dstData,JNI_OK);
+}
+
+void Jni_RgbaScale(JNIEnv * env,jclass clazz,jint type,jbyteArray src,jint src_width,jint src_height,jbyteArray dst,jint dst_width,jint dst_height,jint mode){
+ int bytes=(type&0x0F000000)>>24;
+ jbyte * srcData=env->GetByteArrayElements(src,JNI_FALSE);
+ jbyte * dstData=env->GetByteArrayElements(dst,JNI_FALSE);
+ libyuv::ARGBScale((const uint8 *) srcData, bytes * src_width, src_width, src_height,
+ (uint8 *) dstData, bytes*dst_width, dst_width, dst_height,
+ (libyuv::FilterMode) mode);
+ env->ReleaseByteArrayElements(src,srcData,JNI_OK);
+ env->ReleaseByteArrayElements(dst,dstData,JNI_OK);
+}
+
+void Jni_NV21ToI420Rotate(JNIEnv * env,jclass clazz,jbyteArray src,jint width,jint height,jbyteArray dst,jint de,jboolean swapUV){
+ int dst_stride[]={height,width,height};
+ size_t ySize=(size_t) (width * height);
+ size_t swap[]={0,ySize>>2};
+ jbyte * srcData=env->GetByteArrayElements(src,JNI_FALSE);
+ jbyte * dstData=env->GetByteArrayElements(dst,JNI_FALSE);
+ rotatePlaneFunc[de]((const uint8 *) srcData, width, (uint8 *) dstData, dst_stride[de], width, height);
+ rotateUVFunc[de]((const uint8 *) (srcData+ySize), width, (uint8 *)(dstData+ySize+swap[swapUV]), dst_stride[de]>>1,
+ (uint8*)(dstData+ySize+swap[1-swapUV]),dst_stride[de]>>1,width>>1, height>>1);
+ env->ReleaseByteArrayElements(src,srcData,JNI_OK);
+ env->ReleaseByteArrayElements(dst,dstData,JNI_OK);
+}
+
+int Jni_RgbaToI420WithStride(JNIEnv * env,jclass clazz,jint type,jbyteArray rgba,jint rgba_stride,
+ jbyteArray yuv,jint y_stride,jint u_stride,jint v_stride,
+ jint width,jint height){
+ uint8 cType= (uint8) (type & 0x0F);
+ return rgbaToI420(env,clazz,rgba,rgba_stride,yuv,y_stride,u_stride,v_stride,width,height,rgbaToI420Func[cType]);
+}
+
+int Jni_RgbaToI420(JNIEnv * env,jclass clazz,jint type,jbyteArray rgba,jbyteArray yuv,jint width,jint height){
+ uint8 cType=(uint8) (type & 0x0F);
+ int rgba_stride= ((type & 0xF0) >> 4)*width;
+ int y_stride=width;
+ int u_stride=width>>1;
+ int v_stride=u_stride;
+ return rgbaToI420(env,clazz,rgba,rgba_stride,yuv,y_stride,u_stride,v_stride,width,height,rgbaToI420Func[cType]);
+}
+
+int Jni_I420ToRgbaWithStride(JNIEnv * env,jclass clazz,jint type,jbyteArray yuv,jint y_stride,jint u_stride,jint v_stride,
+ jbyteArray rgba,jint rgba_stride,
+ jint width,jint height){
+ uint8 cType=(uint8) (type & 0x0F);
+ return i420ToRgba(env,clazz,yuv,y_stride,u_stride,v_stride,rgba,rgba_stride,width,height,i420ToRgbaFunc[cType]);
+}
+
+int Jni_I420ToRgba(JNIEnv * env,jclass clazz,jint type,jbyteArray yuv,jbyteArray rgba,jint width,jint height){
+ uint8 cType=(uint8) (type & 0x0F);
+ int rgba_stride= ((type & 0xF0) >> 4)*width;
+ int y_stride=width;
+ int u_stride=width>>1;
+ int v_stride=u_stride;
+ return i420ToRgba(env,clazz,yuv,y_stride,u_stride,v_stride,rgba,rgba_stride,width,height,i420ToRgbaFunc[cType]);
+}
+
+
+//libyuv中,rgba表示abgrabgrabgr这样的顺序写入文件,java使用的时候习惯rgba表示rgbargbargba写入文件
+static JNINativeMethod g_methods[]={
+ {"RgbaToI420","(I[BI[BIIIII)I", (void *)Jni_RgbaToI420WithStride},
+ {"RgbaToI420","(I[B[BII)I", (void *)Jni_RgbaToI420},
+
+ {"I420ToRgba","(I[BIII[BIII)I", (void *)Jni_I420ToRgbaWithStride},
+ {"I420ToRgba", "(I[B[BII)I", (void *)Jni_I420ToRgba},
+
+ {"I420ToNV21","([B[BIIZ)V",(void *)Jni_I420ToNV21},
+ {"NV21ToI420","([B[BIIZ)V",(void *)Jni_NV21ToI420},
+ {"NV21Scale","([BII[BIII)V",(void *)Jni_NV21Scale},
+ {"I420Scale","([BII[BIIIZ)V",(void *)Jni_I420Scale},
+ {"RgbaScale","([BII[BIII)V",(void *)Jni_RgbaScale},
+ {"NV21ToI420Rotate","([BII[BIZ)V",(void *)Jni_NV21ToI420Rotate},
+};
+
+JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *reserved)
+{
+ JNIEnv* env = nullptr;
+
+ if (vm->GetEnv((void**)&env, JNI_VERSION_1_4) != JNI_OK) {
+ return JNI_ERR;
+ }
+ assert(env != nullptr);
+ jclass clazz=env->FindClass(YUV_UTILS_JAVA);
+ env->RegisterNatives(clazz, g_methods, (int) (sizeof(g_methods) / sizeof((g_methods)[0])));
+
+ return JNI_VERSION_1_4;
+}
+
+JNIEXPORT void JNI_OnUnload(JavaVM *jvm, void *reserved){
+ JNIEnv* env = nullptr;
+
+ if (jvm->GetEnv((void**)&env, JNI_VERSION_1_4) != JNI_OK) {
+ return;
+ }
+ jclass clazz=env->FindClass(YUV_UTILS_JAVA);
+ env->UnregisterNatives(clazz);
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+
+
diff --git a/libyuv/src/main/cpp/libyuv/.clang-format b/libyuv/src/main/cpp/libyuv/.clang-format
new file mode 100644
index 0000000..59d4870
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/.clang-format
@@ -0,0 +1,6 @@
+# Defines the Chromium style for automatic reformatting.
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+BasedOnStyle: Chromium
+---
+Language: Java
+BasedOnStyle: Google
diff --git a/libyuv/src/main/cpp/libyuv/.gitignore b/libyuv/src/main/cpp/libyuv/.gitignore
new file mode 100644
index 0000000..cdd6046
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/.gitignore
@@ -0,0 +1,35 @@
+*.pyc
+.landmines
+pin-log.txt
+/base
+/build
+/buildtools
+/google_apis
+/links
+/links.db
+/ios
+/mojo
+/native_client
+/net
+/out
+/sde-avx-sse-transition-out.txt
+/testing
+/third_party
+/tools
+
+# Files generated by CMake build
+cmake_install.cmake
+CMakeCache.txt
+CMakeFiles/
+yuvconvert
+libgtest.a
+libyuv.a
+libyuv_unittest
+
+# Files generated by winarm.mk build
+libyuv_arm.lib
+source/*.o
+
+# Files generated by perf
+perf.data
+perf.data.old
diff --git a/libyuv/src/main/cpp/libyuv/.gn b/libyuv/src/main/cpp/libyuv/.gn
new file mode 100644
index 0000000..21e83bd
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/.gn
@@ -0,0 +1,52 @@
+# Copyright 2015 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# The location of the build configuration file.
+buildconfig = "//build/config/BUILDCONFIG.gn"
+
+# The secondary source root is a parallel directory tree where
+# GN build files are placed when they can not be placed directly
+# in the source tree, e.g. for third party source trees.
+secondary_source = "//build/secondary/"
+
+# These are the targets to check headers for by default. The files in targets
+# matching these patterns (see "gn help label_pattern" for format) will have
+# their includes checked for proper dependencies when you run either
+# "gn check" or "gn gen --check".
+check_targets = [ "//libyuv/*" ]
+
+# These are the list of GN files that run exec_script. This whitelist exists
+# to force additional review for new uses of exec_script, which is strongly
+# discouraged except for gypi_to_gn calls.
+exec_script_whitelist = [
+ "//build/config/android/BUILD.gn",
+ "//build/config/android/config.gni",
+ "//build/config/android/internal_rules.gni",
+ "//build/config/android/rules.gni",
+ "//build/config/BUILD.gn",
+ "//build/config/compiler/BUILD.gn",
+ "//build/config/gcc/gcc_version.gni",
+ "//build/config/ios/ios_sdk.gni",
+ "//build/config/linux/BUILD.gn",
+ "//build/config/linux/pkg_config.gni",
+ "//build/config/mac/mac_sdk.gni",
+ "//build/config/posix/BUILD.gn",
+ "//build/config/sysroot.gni",
+ "//build/config/win/BUILD.gn",
+ "//build/config/win/visual_studio_version.gni",
+ "//build/gn_helpers.py",
+ "//build/gypi_to_gn.py",
+ "//build/toolchain/concurrent_links.gni",
+ "//build/toolchain/gcc_toolchain.gni",
+ "//build/toolchain/mac/BUILD.gn",
+ "//build/toolchain/win/BUILD.gn",
+]
+
+default_args = {
+ mac_sdk_min = "10.11"
+}
diff --git a/libyuv/src/main/cpp/libyuv/AUTHORS b/libyuv/src/main/cpp/libyuv/AUTHORS
new file mode 100644
index 0000000..9686ac1
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/AUTHORS
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization
+
+Google Inc.
diff --git a/libyuv/src/main/cpp/libyuv/Android.mk b/libyuv/src/main/cpp/libyuv/Android.mk
new file mode 100644
index 0000000..e034592
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/Android.mk
@@ -0,0 +1,102 @@
+# This is the Android makefile for libyuv for NDK.
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_CPP_EXTENSION := .cc
+
+LOCAL_SRC_FILES := \
+ source/compare.cc \
+ source/compare_common.cc \
+ source/compare_gcc.cc \
+ source/compare_neon.cc \
+ source/compare_neon64.cc \
+ source/convert.cc \
+ source/convert_argb.cc \
+ source/convert_from.cc \
+ source/convert_from_argb.cc \
+ source/convert_to_argb.cc \
+ source/convert_to_i420.cc \
+ source/cpu_id.cc \
+ source/planar_functions.cc \
+ source/rotate.cc \
+ source/rotate_any.cc \
+ source/rotate_argb.cc \
+ source/rotate_common.cc \
+ source/rotate_dspr2.cc \
+ source/rotate_gcc.cc \
+ source/rotate_msa.cc \
+ source/rotate_neon.cc \
+ source/rotate_neon64.cc \
+ source/row_any.cc \
+ source/row_common.cc \
+ source/row_dspr2.cc \
+ source/row_gcc.cc \
+ source/row_msa.cc \
+ source/row_neon.cc \
+ source/row_neon64.cc \
+ source/scale.cc \
+ source/scale_any.cc \
+ source/scale_argb.cc \
+ source/scale_common.cc \
+ source/scale_dspr2.cc \
+ source/scale_gcc.cc \
+ source/scale_msa.cc \
+ source/scale_neon.cc \
+ source/scale_neon64.cc \
+ source/video_common.cc
+
+common_CFLAGS := -Wall -fexceptions
+ifneq ($(LIBYUV_DISABLE_JPEG), "yes")
+LOCAL_SRC_FILES += \
+ source/convert_jpeg.cc \
+ source/mjpeg_decoder.cc \
+ source/mjpeg_validate.cc
+common_CFLAGS += -DHAVE_JPEG
+LOCAL_SHARED_LIBRARIES := libjpeg
+endif
+
+LOCAL_CFLAGS += $(common_CFLAGS)
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include
+
+LOCAL_MODULE := libyuv_static
+LOCAL_MODULE_TAGS := optional
+
+include $(BUILD_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+
+LOCAL_WHOLE_STATIC_LIBRARIES := libyuv_static
+LOCAL_MODULE := libyuv
+ifneq ($(LIBYUV_DISABLE_JPEG), "yes")
+LOCAL_SHARED_LIBRARIES := libjpeg
+endif
+
+include $(BUILD_SHARED_LIBRARY)
+
+include $(CLEAR_VARS)
+LOCAL_STATIC_LIBRARIES := libyuv_static
+LOCAL_SHARED_LIBRARIES := libjpeg
+LOCAL_MODULE_TAGS := tests
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
+LOCAL_SRC_FILES := \
+ unit_test/unit_test.cc \
+ unit_test/basictypes_test.cc \
+ unit_test/color_test.cc \
+ unit_test/compare_test.cc \
+ unit_test/convert_test.cc \
+ unit_test/cpu_test.cc \
+ unit_test/cpu_thread_test.cc \
+ unit_test/math_test.cc \
+ unit_test/planar_test.cc \
+ unit_test/rotate_argb_test.cc \
+ unit_test/rotate_test.cc \
+ unit_test/scale_argb_test.cc \
+ unit_test/scale_test.cc \
+ unit_test/video_common_test.cc
+
+LOCAL_MODULE := libyuv_unittest
+include $(BUILD_NATIVE_TEST)
diff --git a/libyuv/src/main/cpp/libyuv/BUILD.gn b/libyuv/src/main/cpp/libyuv/BUILD.gn
new file mode 100644
index 0000000..ecc51f9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/BUILD.gn
@@ -0,0 +1,359 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import("libyuv.gni")
+import("//testing/test.gni")
+
+declare_args() {
+ # Set to false to disable building with gflags.
+ libyuv_use_gflags = true
+}
+
+config("libyuv_config") {
+ include_dirs = [ "include" ]
+ if (is_android && current_cpu == "arm64") {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+ }
+ if (is_android && current_cpu != "arm64") {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+ }
+}
+
+# This target is built when no specific target is specified on the command line.
+group("default") {
+ testonly = true
+ deps = [
+ ":libyuv",
+ ]
+ if (libyuv_include_tests) {
+ deps += [
+ ":compare",
+ ":cpuid",
+ ":libyuv_unittest",
+ ":psnr",
+ ":yuvconvert",
+ ]
+ }
+}
+
+group("libyuv") {
+ public_configs = [ ":libyuv_config" ]
+
+ if (is_win && target_cpu == "x64") {
+ # Compile with clang in order to get inline assembly
+ public_deps = [
+ ":libyuv_internal(//build/toolchain/win:clang_x64)",
+ ]
+ } else {
+ public_deps = [
+ ":libyuv_internal",
+ ]
+ }
+
+ if (!is_ios) {
+ # Make sure that clients of libyuv link with libjpeg. This can't go in
+ # libyuv_internal because in Windows x64 builds that will generate a clang
+ # build of libjpeg, and we don't want two copies.
+ deps = [
+ "//third_party:jpeg",
+ ]
+ }
+}
+
+static_library("libyuv_internal") {
+ visibility = [ ":*" ]
+
+ sources = [
+ # Headers
+ "include/libyuv.h",
+ "include/libyuv/basic_types.h",
+ "include/libyuv/compare.h",
+ "include/libyuv/convert.h",
+ "include/libyuv/convert_argb.h",
+ "include/libyuv/convert_from.h",
+ "include/libyuv/convert_from_argb.h",
+ "include/libyuv/cpu_id.h",
+ "include/libyuv/mjpeg_decoder.h",
+ "include/libyuv/planar_functions.h",
+ "include/libyuv/rotate.h",
+ "include/libyuv/rotate_argb.h",
+ "include/libyuv/rotate_row.h",
+ "include/libyuv/row.h",
+ "include/libyuv/scale.h",
+ "include/libyuv/scale_argb.h",
+ "include/libyuv/scale_row.h",
+ "include/libyuv/version.h",
+ "include/libyuv/video_common.h",
+
+ # Source Files
+ "source/compare.cc",
+ "source/compare_common.cc",
+ "source/compare_gcc.cc",
+ "source/compare_win.cc",
+ "source/convert.cc",
+ "source/convert_argb.cc",
+ "source/convert_from.cc",
+ "source/convert_from_argb.cc",
+ "source/convert_jpeg.cc",
+ "source/convert_to_argb.cc",
+ "source/convert_to_i420.cc",
+ "source/cpu_id.cc",
+ "source/mjpeg_decoder.cc",
+ "source/mjpeg_validate.cc",
+ "source/planar_functions.cc",
+ "source/rotate.cc",
+ "source/rotate_any.cc",
+ "source/rotate_argb.cc",
+ "source/rotate_common.cc",
+ "source/rotate_dspr2.cc",
+ "source/rotate_gcc.cc",
+ "source/rotate_win.cc",
+ "source/row_any.cc",
+ "source/row_common.cc",
+ "source/row_dspr2.cc",
+ "source/row_gcc.cc",
+ "source/row_win.cc",
+ "source/scale.cc",
+ "source/scale_any.cc",
+ "source/scale_argb.cc",
+ "source/scale_common.cc",
+ "source/scale_dspr2.cc",
+ "source/scale_gcc.cc",
+ "source/scale_win.cc",
+ "source/video_common.cc",
+ ]
+
+ configs += [ ":libyuv_config" ]
+ defines = []
+ deps = []
+
+ if (!is_ios) {
+ defines += [ "HAVE_JPEG" ]
+
+ # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
+ # because in Windows x64 build it will get compiled with clang.
+ deps += [ "//third_party:jpeg_includes" ]
+ }
+
+ if (libyuv_use_neon) {
+ deps += [ ":libyuv_neon" ]
+ }
+
+ if (libyuv_use_msa) {
+ deps += [ ":libyuv_msa" ]
+ }
+
+ # Always enable optimization for Release and NaCl builds (to workaround
+ # crbug.com/538243).
+ if (!is_debug || is_nacl) {
+ configs -= [ "//build/config/compiler:default_optimization" ]
+
+ # Enable optimize for speed (-O2) over size (-Os).
+ configs += [ "//build/config/compiler:optimize_max" ]
+ }
+
+ # To enable AVX2 or other cpu optimization, pass flag here
+ # cflags = [ "-mavx2" ]
+ # cflags = [ "-mpopcnt" ]
+}
+
+if (libyuv_use_neon) {
+ static_library("libyuv_neon") {
+ sources = [
+ # ARM Source Files
+ "source/compare_neon.cc",
+ "source/compare_neon64.cc",
+ "source/rotate_neon.cc",
+ "source/rotate_neon64.cc",
+ "source/row_neon.cc",
+ "source/row_neon64.cc",
+ "source/scale_neon.cc",
+ "source/scale_neon64.cc",
+ ]
+
+ public_configs = [ ":libyuv_config" ]
+
+ # Always enable optimization for Release and NaCl builds (to workaround
+ # crbug.com/538243).
+ if (!is_debug) {
+ configs -= [ "//build/config/compiler:default_optimization" ]
+
+ # Enable optimize for speed (-O2) over size (-Os).
+ configs += [ "//build/config/compiler:optimize_max" ]
+ }
+
+ if (current_cpu != "arm64") {
+ configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+ cflags = [ "-mfpu=neon" ]
+ }
+ }
+}
+
+if (libyuv_use_msa) {
+ static_library("libyuv_msa") {
+ sources = [
+ # MSA Source Files
+ "source/rotate_msa.cc",
+ "source/row_msa.cc",
+ "source/scale_msa.cc",
+ ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_include_tests) {
+ config("libyuv_unittest_warnings_config") {
+ if (!is_win) {
+ cflags = [
+ # TODO(fbarchard): Fix sign and unused variable warnings.
+ "-Wno-sign-compare",
+ "-Wno-unused-variable",
+ ]
+ }
+ if (is_win) {
+ cflags = [
+ "/wd4245", # signed/unsigned mismatch
+ "/wd4189", # local variable is initialized but not referenced
+ ]
+ }
+ }
+ config("libyuv_unittest_config") {
+ defines = [ "GTEST_RELATIVE_PATH" ]
+ }
+
+ test("libyuv_unittest") {
+ testonly = true
+
+ sources = [
+ # sources
+ # headers
+ "unit_test/basictypes_test.cc",
+ "unit_test/color_test.cc",
+ "unit_test/compare_test.cc",
+ "unit_test/convert_test.cc",
+ "unit_test/cpu_test.cc",
+ "unit_test/cpu_thread_test.cc",
+ "unit_test/math_test.cc",
+ "unit_test/planar_test.cc",
+ "unit_test/rotate_argb_test.cc",
+ "unit_test/rotate_test.cc",
+ "unit_test/scale_argb_test.cc",
+ "unit_test/scale_test.cc",
+ "unit_test/unit_test.cc",
+ "unit_test/unit_test.h",
+ "unit_test/video_common_test.cc",
+ ]
+
+ deps = [
+ ":libyuv",
+ "//testing/gtest",
+ ]
+
+ defines = []
+ if (libyuv_use_gflags) {
+ defines += [ "LIBYUV_USE_GFLAGS" ]
+ deps += [ "//third_party/gflags" ]
+ }
+
+ configs += [ ":libyuv_unittest_warnings_config" ]
+
+ public_deps = [
+ "//testing/gtest",
+ ]
+ public_configs = [ ":libyuv_unittest_config" ]
+
+ if (is_linux) {
+ cflags = [ "-fexceptions" ]
+ }
+ if (is_ios) {
+ configs -= [ "//build/config/compiler:default_symbols" ]
+ configs += [ "//build/config/compiler:symbols" ]
+ cflags = [ "-Wno-sometimes-uninitialized" ]
+ }
+ if (!is_ios && !libyuv_disable_jpeg) {
+ defines += [ "HAVE_JPEG" ]
+ }
+ if (is_android) {
+ deps += [ "//testing/android/native_test:native_test_native_code" ]
+ }
+
+ # TODO(YangZhang): These lines can be removed when high accuracy
+ # YUV to RGB to Neon is ported.
+ if ((target_cpu == "armv7" || target_cpu == "armv7s" ||
+ (target_cpu == "arm" && arm_version >= 7) || target_cpu == "arm64") &&
+ (arm_use_neon || arm_optionally_use_neon)) {
+ defines += [ "LIBYUV_NEON" ]
+ }
+
+ defines += [
+ # Enable the following 3 macros to turn off assembly for specified CPU.
+ # "LIBYUV_DISABLE_X86",
+ # "LIBYUV_DISABLE_NEON",
+ # "LIBYUV_DISABLE_DSPR2",
+ # Enable the following macro to build libyuv as a shared library (dll).
+ # "LIBYUV_USING_SHARED_LIBRARY"
+ ]
+ }
+
+ executable("compare") {
+ sources = [
+ # sources
+ "util/compare.cc",
+ ]
+ deps = [
+ ":libyuv",
+ "//build/config:exe_and_shlib_deps", # for asan on llvm libc++
+ ]
+ if (is_linux) {
+ cflags = [ "-fexceptions" ]
+ }
+ }
+
+ executable("yuvconvert") {
+ sources = [
+ # sources
+ "util/yuvconvert.cc",
+ ]
+ deps = [
+ ":libyuv",
+ "//build/config:exe_and_shlib_deps", # for new[] on llvm libc++
+ ]
+ if (is_linux) {
+ cflags = [ "-fexceptions" ]
+ }
+ }
+
+ executable("psnr") {
+ sources = [
+ # sources
+ "util/psnr.cc",
+ "util/psnr_main.cc",
+ "util/ssim.cc",
+ ]
+ deps = [
+ ":libyuv",
+ "//build/config:exe_and_shlib_deps", # for new[] on llvm libc++
+ ]
+
+ if (!is_ios && !libyuv_disable_jpeg) {
+ defines = [ "HAVE_JPEG" ]
+ }
+ }
+
+ executable("cpuid") {
+ sources = [
+ # sources
+ "util/cpuid.c",
+ ]
+ deps = [
+ ":libyuv",
+ "//build/config:exe_and_shlib_deps", # for asan on llvm libc++
+ ]
+ }
+}
diff --git a/libyuv/src/main/cpp/libyuv/CM_linux_packages.cmake b/libyuv/src/main/cpp/libyuv/CM_linux_packages.cmake
new file mode 100644
index 0000000..5f676f8
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/CM_linux_packages.cmake
@@ -0,0 +1,69 @@
+# determine the version number from the #define in libyuv/version.h
+EXECUTE_PROCESS (
+ COMMAND grep --perl-regex --only-matching "(?<=LIBYUV_VERSION )[0-9]+" include/libyuv/version.h
+ WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+ OUTPUT_VARIABLE YUV_VERSION_NUMBER
+ OUTPUT_STRIP_TRAILING_WHITESPACE )
+SET ( YUV_VER_MAJOR 0 )
+SET ( YUV_VER_MINOR 0 )
+SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} )
+SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} )
+MESSAGE ( "Building ver.: ${YUV_VERSION}" )
+
+# is this a 32-bit or 64-bit build?
+IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+ SET ( YUV_BIT_SIZE 64 )
+ELSEIF ( CMAKE_SIZEOF_VOID_P EQUAL 4 )
+ SET ( YUV_BIT_SIZE 32 )
+ELSE ()
+ MESSAGE ( FATAL_ERROR "CMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}" )
+ENDIF ()
+
+# detect if this is a ARM build
+STRING (FIND "${CMAKE_CXX_COMPILER}" "arm-linux-gnueabihf-g++" pos)
+IF ( ${pos} EQUAL -1 )
+ SET ( YUV_CROSS_COMPILE_FOR_ARM7 FALSE )
+ELSE ()
+ MESSAGE ( "Cross compiling for ARM7" )
+ SET ( YUV_CROSS_COMPILE_FOR_ARM7 TRUE )
+ENDIF ()
+STRING (FIND "${CMAKE_SYSTEM_PROCESSOR}" "arm" pos)
+IF ( ${pos} EQUAL -1 )
+ SET ( YUV_COMPILE_FOR_ARM7 FALSE )
+ELSE ()
+ MESSAGE ( "Compiling for ARM" )
+ SET ( YUV_COMPILE_FOR_ARM7 TRUE )
+ENDIF ()
+
+# setup the sytem name, such as "x86-32", "amd-64", and "arm-32
+IF ( ${YUV_CROSS_COMPILE_FOR_ARM7} OR ${YUV_COMPILE_FOR_ARM7} )
+ SET ( YUV_SYSTEM_NAME "armhf-${YUV_BIT_SIZE}" )
+ELSE ()
+ IF ( YUV_BIT_SIZE EQUAL 32 )
+ SET ( YUV_SYSTEM_NAME "x86-${YUV_BIT_SIZE}" )
+ ELSE ()
+ SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" )
+ ENDIF ()
+ENDIF ()
+MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" )
+
+# define all the variables needed by CPack to create .deb and .rpm packages
+SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" )
+SET ( CPACK_PACKAGE_CONTACT "fbarchard@chromium.org" )
+SET ( CPACK_PACKAGE_VERSION ${YUV_VERSION} )
+SET ( CPACK_PACKAGE_VERSION_MAJOR ${YUV_VER_MAJOR} )
+SET ( CPACK_PACKAGE_VERSION_MINOR ${YUV_VER_MINOR} )
+SET ( CPACK_PACKAGE_VERSION_PATCH ${YUV_VER_PATCH} )
+SET ( CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/LICENSE )
+SET ( CPACK_SYSTEM_NAME "linux-${YUV_SYSTEM_NAME}" )
+SET ( CPACK_PACKAGE_NAME "libyuv" )
+SET ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "YUV library" )
+SET ( CPACK_PACKAGE_DESCRIPTION "YUV library and YUV conversion tool" )
+SET ( CPACK_DEBIAN_PACKAGE_SECTION "other" )
+SET ( CPACK_DEBIAN_PACKAGE_PRIORITY "optional" )
+SET ( CPACK_DEBIAN_PACKAGE_MAINTAINER "Frank Barchard " )
+SET ( CPACK_GENERATOR "DEB;RPM" )
+
+# create the .deb and .rpm files (you'll need build-essential and rpm tools)
+INCLUDE( CPack )
+
diff --git a/libyuv/src/main/cpp/libyuv/CMakeLists.txt b/libyuv/src/main/cpp/libyuv/CMakeLists.txt
new file mode 100644
index 0000000..6420371
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/CMakeLists.txt
@@ -0,0 +1,83 @@
+# CMakeLists for libyuv
+# Originally created for "roxlu build system" to compile libyuv on windows
+# Run with -DTEST=ON to build unit tests
+
+PROJECT ( YUV C CXX ) # "C" is required even for C++ projects
+CMAKE_MINIMUM_REQUIRED( VERSION 2.8 )
+OPTION( TEST "Built unit tests" OFF )
+
+SET ( ly_base_dir ${PROJECT_SOURCE_DIR} )
+SET ( ly_src_dir ${ly_base_dir}/source )
+SET ( ly_inc_dir ${ly_base_dir}/include )
+SET ( ly_tst_dir ${ly_base_dir}/unit_test )
+SET ( ly_lib_name yuv )
+SET ( ly_lib_static ${ly_lib_name} )
+SET ( ly_lib_shared ${ly_lib_name}_shared )
+
+FILE ( GLOB_RECURSE ly_source_files ${ly_src_dir}/*.cc )
+LIST ( SORT ly_source_files )
+
+FILE ( GLOB_RECURSE ly_unittest_sources ${ly_tst_dir}/*.cc )
+LIST ( SORT ly_unittest_sources )
+
+INCLUDE_DIRECTORIES( BEFORE ${ly_inc_dir} )
+
+# this creates the static library (.a)
+ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} )
+
+# this creates the shared library (.so)
+ADD_LIBRARY ( ${ly_lib_shared} SHARED ${ly_source_files} )
+SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}" )
+SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES PREFIX "lib" )
+
+# this creates the conversion tool
+ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc )
+TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} )
+
+
+INCLUDE ( FindJPEG )
+if (JPEG_FOUND)
+ include_directories( ${JPEG_INCLUDE_DIR} )
+ target_link_libraries( yuvconvert ${JPEG_LIBRARY} )
+ add_definitions( -DHAVE_JPEG )
+endif()
+
+if(TEST)
+ find_library(GTEST_LIBRARY gtest)
+ if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND")
+ set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources")
+ if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc)
+ message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}")
+ set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc)
+ add_library(gtest STATIC ${gtest_sources})
+ include_directories(${GTEST_SRC_DIR})
+ include_directories(${GTEST_SRC_DIR}/include)
+ set(GTEST_LIBRARY gtest)
+ else()
+ message(FATAL_ERROR "TEST is set but unable to find gtest library")
+ endif()
+ endif()
+
+ add_executable(libyuv_unittest ${ly_unittest_sources})
+ target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY} pthread)
+ if (JPEG_FOUND)
+ target_link_libraries(libyuv_unittest ${JPEG_LIBRARY})
+ endif()
+
+ if(NACL AND NACL_LIBC STREQUAL "newlib")
+ target_link_libraries(libyuv_unittest glibc-compat)
+ endif()
+
+ target_link_libraries(libyuv_unittest gflags)
+endif()
+
+
+# install the conversion tool, .so, .a, and all the header files
+INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin )
+INSTALL ( TARGETS ${ly_lib_static} DESTINATION lib )
+INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib )
+INSTALL ( DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include )
+
+# create the .deb and .rpm packages using cpack
+INCLUDE ( CM_linux_packages.cmake )
+
diff --git a/libyuv/src/main/cpp/libyuv/DEPS b/libyuv/src/main/cpp/libyuv/DEPS
new file mode 100644
index 0000000..3568be4
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/DEPS
@@ -0,0 +1,444 @@
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': 'da6245e7c4b489a4dc73be76fcb54483add1eb2b',
+ 'swarming_revision': 'a56c2b39ca23bdf41458421a7f825ddbf3f43f28',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling lss
+ # and whatever else without interference from each other.
+ 'lss_revision': '63f24c8221a229f677d26ebe8f3d1528a9d787ac',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling catapult
+ # and whatever else without interference from each other.
+ 'catapult_revision': '9629af7533357c245a90fb7130fbfba53a162284',
+}
+
+deps = {
+ 'src/build':
+ Var('chromium_git') + '/chromium/src/build' + '@' + 'e3c0667f115674831e9eb2e91ecbec4545b049ab',
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + '5ad14542a6a74dd914f067b948c5d3e8d170396b',
+ 'src/testing':
+ Var('chromium_git') + '/chromium/src/testing' + '@' + 'ebf1c4622eba176624b5084a727b8fe308d2896b',
+ 'src/third_party':
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + 'd58cf433a5327d71ff38bfeab37bd42932f9740b',
+ 'src/third_party/catapult':
+ Var('chromium_git') + '/external/github.com/catapult-project/catapult.git' + '@' + Var('catapult_revision'),
+ 'src/third_party/colorama/src':
+ Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
+ 'src/third_party/googletest/src':
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '42bc671f47b122fad36db5eccbc06868afdf7862',
+ 'src/third_party/libjpeg_turbo':
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'a1750dbc79a8792dde3d3f7d7d8ac28ba01ac9dd',
+ 'src/third_party/yasm/source/patched-yasm':
+ Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '7da28c6c7c6a1387217352ce02b31754deb54d2a',
+ 'src/tools':
+ Var('chromium_git') + '/chromium/src/tools' + '@' + '1397f0ed8bd7b907118588800f31382c1da33afc',
+ 'src/tools/gyp':
+ Var('chromium_git') + '/external/gyp.git' + '@' + 'd61a9397e668fa9843c4aa7da9e79460fe590bfb',
+ 'src/tools/swarming_client':
+ Var('chromium_git') + '/external/swarming.client.git' + '@' + Var('swarming_revision'),
+
+ # libyuv-only dependencies (not present in Chromium).
+ 'src/third_party/gflags':
+ Var('chromium_git') + '/external/webrtc/deps/third_party/gflags' + '@' + '892576179b45861b53e04a112996a738309cf364',
+ 'src/third_party/gflags/src':
+ Var('chromium_git') + '/external/github.com/gflags/gflags' + '@' + '03bebcb065c83beff83d50ae025a55a4bf94dfca',
+ 'src/third_party/gtest-parallel':
+ Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
+}
+
+deps_os = {
+ 'android': {
+ 'src/base':
+ Var('chromium_git') + '/chromium/src/base' + '@' + '8c06e7a9f6d085c70d25f29a1261c4b01bba7470',
+ 'src/third_party/android_tools':
+ Var('chromium_git') + '/android_tools.git' + '@' + 'e9d4018e149d50172ed462a7c21137aa915940ec',
+ 'src/third_party/ced/src':
+ Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '910cca22d881b02cbc8950fa02ccbcdcfb782456',
+ 'src/third_party/icu':
+ Var('chromium_git') + '/chromium/deps/icu.git' + '@' + '1fec0c83e9ad7f5a075ae0b50af9a3889f54be0e',
+ 'src/third_party/jsr-305/src':
+ Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
+ 'src/third_party/junit/src':
+ Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
+ 'src/third_party/lss':
+ Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ 'src/third_party/mockito/src':
+ Var('chromium_git') + '/external/mockito/mockito.git' + '@' + 'de83ad4598ad4cf5ea53c69a8a8053780b04b850',
+ 'src/third_party/requests/src':
+ Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
+ 'src/third_party/robolectric/robolectric':
+ Var('chromium_git') + '/external/robolectric.git' + '@' + '2a0b6ba221c14f3371813a676ce06143353e448d',
+ 'src/third_party/ub-uiautomator/lib':
+ Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
+ },
+ 'ios': {
+ 'src/ios':
+ Var('chromium_git') + '/chromium/src/ios' + '@' + 'e5a58b0b437af90c301a3bbbd25f8c620546168b',
+ },
+ 'unix': {
+ 'src/third_party/lss':
+ Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ },
+ 'win': {
+ # Dependencies used by libjpeg-turbo
+ 'src/third_party/yasm/binaries':
+ Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881',
+ },
+}
+
+# Define rules for which include paths are allowed in our source.
+include_rules = [ '+gflags' ]
+
+pre_deps_hooks = [
+ {
+ # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8.
+ # TODO(kjellander): Remove this in March 2017.
+ 'name': 'cleanup_links',
+ 'pattern': '.',
+ 'action': ['python', 'src/cleanup_links.py'],
+ },
+]
+
+hooks = [
+ {
+ # This clobbers when necessary (based on get_landmines.py). It should be
+ # an early hook but it will need to be run after syncing Chromium and
+ # setting up the links, so the script actually exists.
+ 'name': 'landmines',
+ 'pattern': '.',
+ 'action': [
+ 'python',
+ 'src/build/landmines.py',
+ '--landmine-scripts',
+ 'src/tools_libyuv/get_landmines.py',
+ '--src-dir',
+ 'src',
+ ],
+ },
+ # Android dependencies. Many are downloaded using Google Storage these days.
+ # They're copied from https://cs.chromium.org/chromium/src/DEPS for all
+ # such dependencies we share with Chromium.
+ {
+ # This downloads SDK extras and puts them in the
+ # third_party/android_tools/sdk/extras directory.
+ 'name': 'sdkextras',
+ 'pattern': '.',
+ # When adding a new sdk extras package to download, add the package
+ # directory and zip file to .gitignore in third_party/android_tools.
+ 'action': ['python',
+ 'src/build/android/play_services/update.py',
+ 'download'
+ ],
+ },
+ {
+ 'name': 'intellij',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-intellij',
+ '-l', 'third_party/intellij'
+ ],
+ },
+ {
+ 'name': 'javax_inject',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-javax-inject',
+ '-l', 'third_party/javax_inject'
+ ],
+ },
+ {
+ 'name': 'hamcrest',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-hamcrest',
+ '-l', 'third_party/hamcrest'
+ ],
+ },
+ {
+ 'name': 'guava',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-guava',
+ '-l', 'third_party/guava'
+ ],
+ },
+ {
+ 'name': 'android_support_test_runner',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-android-support-test-runner',
+ '-l', 'third_party/android_support_test_runner'
+ ],
+ },
+ {
+ 'name': 'byte_buddy',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-byte-buddy',
+ '-l', 'third_party/byte_buddy'
+ ],
+ },
+ {
+ 'name': 'espresso',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-espresso',
+ '-l', 'third_party/espresso'
+ ],
+ },
+ {
+ 'name': 'robolectric_libs',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-robolectric',
+ '-l', 'third_party/robolectric'
+ ],
+ },
+ {
+ 'name': 'apache_velocity',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-apache-velocity',
+ '-l', 'third_party/apache_velocity'
+ ],
+ },
+ {
+ 'name': 'ow2_asm',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-ow2-asm',
+ '-l', 'third_party/ow2_asm'
+ ],
+ },
+ {
+ 'name': 'icu4j',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-icu4j',
+ '-l', 'third_party/icu4j'
+ ],
+ },
+ {
+ 'name': 'accessibility_test_framework',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-accessibility-test-framework',
+ '-l', 'third_party/accessibility_test_framework'
+ ],
+ },
+ {
+ 'name': 'bouncycastle',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-bouncycastle',
+ '-l', 'third_party/bouncycastle'
+ ],
+ },
+ {
+ 'name': 'sqlite4java',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-sqlite4java',
+ '-l', 'third_party/sqlite4java'
+ ],
+ },
+ {
+ 'name': 'xstream',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-robolectric',
+ '-l', 'third_party/xstream'
+ ],
+ },
+ {
+ 'name': 'objenesis',
+ 'pattern': '.',
+ 'action': ['python',
+ 'src/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-objenesis',
+ '-l', 'third_party/objenesis'
+ ],
+ },
+ {
+ # Downloads the current stable linux sysroot to build/linux/ if needed.
+ # This sysroot updates at about the same rate that the chrome build deps
+ # change. This script is a no-op except for linux users who are doing
+ # official chrome builds or cross compiling.
+ 'name': 'sysroot',
+ 'pattern': '.',
+ 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--running-as-hook'],
+ },
+ {
+ # Update the Windows toolchain if necessary.
+ 'name': 'win_toolchain',
+ 'pattern': '.',
+ 'action': ['python', 'src/build/vs_toolchain.py', 'update'],
+ },
+ # Pull binutils for linux, enabled debug fission for faster linking /
+ # debugging when used with clang on Ubuntu Precise.
+ # https://code.google.com/p/chromium/issues/detail?id=352046
+ {
+ 'name': 'binutils',
+ 'pattern': 'src/third_party/binutils',
+ 'action': [
+ 'python',
+ 'src/third_party/binutils/download.py',
+ ],
+ },
+ {
+ # Pull clang if needed or requested via GYP_DEFINES.
+ # Note: On Win, this should run after win_toolchain, as it may use it.
+ 'name': 'clang',
+ 'pattern': '.',
+ 'action': ['python', 'src/tools/clang/scripts/update.py', '--if-needed'],
+ },
+ {
+ # Update LASTCHANGE.
+ 'name': 'lastchange',
+ 'pattern': '.',
+ 'action': ['python', 'src/build/util/lastchange.py',
+ '-o', 'src/build/util/LASTCHANGE'],
+ },
+ # Pull GN binaries.
+ {
+ 'name': 'gn_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-gn',
+ '-s', 'src/buildtools/win/gn.exe.sha1',
+ ],
+ },
+ {
+ 'name': 'gn_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-gn',
+ '-s', 'src/buildtools/mac/gn.sha1',
+ ],
+ },
+ {
+ 'name': 'gn_linux64',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-gn',
+ '-s', 'src/buildtools/linux64/gn.sha1',
+ ],
+ },
+ # Pull clang-format binaries using checked-in hashes.
+ {
+ 'name': 'clang_format_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/win/clang-format.exe.sha1',
+ ],
+ },
+ {
+ 'name': 'clang_format_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/mac/clang-format.sha1',
+ ],
+ },
+ {
+ 'name': 'clang_format_linux',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/linux64/clang-format.sha1',
+ ],
+ },
+ # Pull luci-go binaries (isolate, swarming) using checked-in hashes.
+ {
+ 'name': 'luci-go_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/win64',
+ ],
+ },
+ {
+ 'name': 'luci-go_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/mac64',
+ ],
+ },
+ {
+ 'name': 'luci-go_linux',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/linux64',
+ ],
+ },
+]
+
+recursedeps = [
+ # buildtools provides clang_format, libc++, and libc++abi.
+ 'src/buildtools',
+ # android_tools manages the NDK.
+ 'src/third_party/android_tools',
+]
diff --git a/libyuv/src/main/cpp/libyuv/LICENSE b/libyuv/src/main/cpp/libyuv/LICENSE
new file mode 100644
index 0000000..c911747
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/LICENSE
@@ -0,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/libyuv/src/main/cpp/libyuv/LICENSE_THIRD_PARTY b/libyuv/src/main/cpp/libyuv/LICENSE_THIRD_PARTY
new file mode 100644
index 0000000..a71591e
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/LICENSE_THIRD_PARTY
@@ -0,0 +1,8 @@
+This source tree contains third party source code which is governed by third
+party licenses. This file contains references to files which are under other
+licenses than the one provided in the LICENSE file in the root of the source
+tree.
+
+Files governed by third party licenses:
+source/x86inc.asm
+
diff --git a/libyuv/src/main/cpp/libyuv/OWNERS b/libyuv/src/main/cpp/libyuv/OWNERS
new file mode 100644
index 0000000..2db52d3
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/OWNERS
@@ -0,0 +1,13 @@
+fbarchard@chromium.org
+magjed@chromium.org
+torbjorng@chromium.org
+
+per-file *.gyp=kjellander@chromium.org
+per-file *.gn=kjellander@chromium.org
+per-file .gitignore=*
+per-file AUTHORS=*
+per-file DEPS=*
+per-file PRESUBMIT.py=kjellander@chromium.org
+per-file gyp_libyuv.py=kjellander@chromium.org
+per-file setup_links.py=*
+per-file sync_chromium.py=kjellander@chromium.org
diff --git a/libyuv/src/main/cpp/libyuv/PATENTS b/libyuv/src/main/cpp/libyuv/PATENTS
new file mode 100644
index 0000000..64aa5c9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/PATENTS
@@ -0,0 +1,24 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the LibYuv code package.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive,
+no-charge, irrevocable (except as stated in this section) patent
+license to make, have made, use, offer to sell, sell, import,
+transfer, and otherwise run, modify and propagate the contents of this
+implementation of the LibYuv code package, where such license applies
+only to those patent claims, both currently owned by Google and
+acquired in the future, licensable by Google that are necessarily
+infringed by this implementation of the LibYuv code package. This
+grant does not include claims that would be infringed only as a
+consequence of further modification of this implementation. If you or
+your agent or exclusive licensee institute or order or agree to the
+institution of patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that this
+implementation of the LibYuv code package or any code incorporated
+within this implementation of the LibYuv code package constitutes
+direct or contributory patent infringement, or inducement of patent
+infringement, then any patent rights granted to you under this License
+for this implementation of the LibYuv code package shall terminate as
+of the date such litigation is filed.
\ No newline at end of file
diff --git a/libyuv/src/main/cpp/libyuv/PRESUBMIT.py b/libyuv/src/main/cpp/libyuv/PRESUBMIT.py
new file mode 100644
index 0000000..2cf1542
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/PRESUBMIT.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import os
+
+
+def _RunPythonTests(input_api, output_api):
+ def join(*args):
+ return input_api.os_path.join(input_api.PresubmitLocalPath(), *args)
+
+ test_directories = [
+ root for root, _, files in os.walk(join('tools_libyuv'))
+ if any(f.endswith('_test.py') for f in files)
+ ]
+
+ tests = []
+ for directory in test_directories:
+ tests.extend(
+ input_api.canned_checks.GetUnitTestsInDirectory(
+ input_api,
+ output_api,
+ directory,
+ whitelist=[r'.+_test\.py$']))
+ return input_api.RunTests(tests, parallel=True)
+
+
+def _CommonChecks(input_api, output_api):
+ """Checks common to both upload and commit."""
+ results = []
+ results.extend(input_api.canned_checks.RunPylint(input_api, output_api,
+ black_list=(r'^base[\\\/].*\.py$',
+ r'^build[\\\/].*\.py$',
+ r'^buildtools[\\\/].*\.py$',
+ r'^ios[\\\/].*\.py$',
+ r'^out.*[\\\/].*\.py$',
+ r'^testing[\\\/].*\.py$',
+ r'^third_party[\\\/].*\.py$',
+ r'^tools[\\\/].*\.py$',
+ # TODO(kjellander): should arguably be checked.
+ r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$',
+ r'^xcodebuild.*[\\\/].*\.py$',),
+ disabled_warnings=['F0401', # Failed to import x
+ 'E0611', # No package y in x
+ 'W0232', # Class has no __init__ method
+ ],
+ pylintrc='pylintrc'))
+ results.extend(_RunPythonTests(input_api, output_api))
+ return results
+
+
+def CheckChangeOnUpload(input_api, output_api):
+ results = []
+ results.extend(_CommonChecks(input_api, output_api))
+ results.extend(
+ input_api.canned_checks.CheckGNFormatted(input_api, output_api))
+ return results
+
+
+def CheckChangeOnCommit(input_api, output_api):
+ results = []
+ results.extend(_CommonChecks(input_api, output_api))
+ results.extend(input_api.canned_checks.CheckOwners(input_api, output_api))
+ results.extend(input_api.canned_checks.CheckChangeWasUploaded(
+ input_api, output_api))
+ results.extend(input_api.canned_checks.CheckChangeHasDescription(
+ input_api, output_api))
+ return results
diff --git a/libyuv/src/main/cpp/libyuv/README.chromium b/libyuv/src/main/cpp/libyuv/README.chromium
new file mode 100644
index 0000000..53ebbc8
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/README.chromium
@@ -0,0 +1,8 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 1662
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling functionality.
diff --git a/libyuv/src/main/cpp/libyuv/README.md b/libyuv/src/main/cpp/libyuv/README.md
new file mode 100644
index 0000000..b59b71c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/README.md
@@ -0,0 +1,18 @@
+**libyuv** is an open source project that includes YUV scaling and conversion functionality.
+
+* Scale YUV to prepare content for compression, with point, bilinear or box filter.
+* Convert to YUV from webcam formats.
+* Convert from YUV to formats for rendering/effects.
+* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode.
+* Optimized for SSE2/SSSE3/AVX2 on x86/x64.
+* Optimized for Neon on Arm.
+* Optimized for DSP R2 on Mips.
+
+### Development
+
+See [Getting started] [1] for instructions on how to get started developing.
+
+You can also browse the [docs directory] [2] for more documentation.
+
+[1]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/getting_started.md
+[2]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/
diff --git a/libyuv/src/main/cpp/libyuv/all.gyp b/libyuv/src/main/cpp/libyuv/all.gyp
new file mode 100644
index 0000000..cc72d9d
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/all.gyp
@@ -0,0 +1,21 @@
+# Copyright 2013 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# all.gyp and All target are for benefit of android gyp build.
+{
+ 'targets': [
+ {
+ 'target_name': 'All',
+ 'type': 'none',
+ 'dependencies': [
+ 'libyuv.gyp:*',
+ 'libyuv_test.gyp:*',
+ ],
+ },
+ ],
+}
diff --git a/libyuv/src/main/cpp/libyuv/android.toolchain.cmake b/libyuv/src/main/cpp/libyuv/android.toolchain.cmake
new file mode 100644
index 0000000..d9f15f9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/android.toolchain.cmake
@@ -0,0 +1,666 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configurable variables.
+# Modeled after the ndk-build system.
+# For any variables defined in:
+# https://developer.android.com/ndk/guides/android_mk.html
+# https://developer.android.com/ndk/guides/application_mk.html
+# if it makes sense for CMake, then replace LOCAL, APP, or NDK with ANDROID, and
+# we have that variable below.
+# The exception is ANDROID_TOOLCHAIN vs NDK_TOOLCHAIN_VERSION.
+# Since we only have one version of each gcc and clang, specifying a version
+# doesn't make much sense.
+#
+# ANDROID_NDK
+# ANDROID_TOOLCHAIN
+# ANDROID_ABI
+# ANDROID_PLATFORM
+# ANDROID_STL
+# ANDROID_PIE
+# ANDROID_CPP_FEATURES
+# ANDROID_ALLOW_UNDEFINED_SYMBOLS
+# ANDROID_ARM_MODE
+# ANDROID_ARM_NEON
+# ANDROID_DISABLE_NO_EXECUTE
+# ANDROID_DISABLE_RELRO
+# ANDROID_DISABLE_FORMAT_STRING_CHECKS
+# ANDROID_CCACHE
+
+cmake_minimum_required(VERSION 3.6.0)
+
+set(ANDROID_NDK_REVISION 12)
+
+# Touch toolchain variable to suppress "unused variable" warning.
+# This happens if CMake is invoked with the same command line the second time.
+if(CMAKE_TOOLCHAIN_FILE)
+endif()
+
+# Compatibility for configurable variables.
+# Compatible with configurable variables from the other toolchain file:
+# https://github.com/taka-no-me/android-cmake
+# TODO: We should consider dropping compatibility to simplify things once most
+# of our users have migrated to our standard set of configurable variables.
+if(ANDROID_TOOLCHAIN_NAME AND NOT ANDROID_TOOLCHAIN)
+ if(ANDROID_TOOLCHAIN_NAME MATCHES "-clang([0-9].[0-9])?$")
+ set(ANDROID_TOOLCHAIN clang)
+ elseif(ANDROID_TOOLCHAIN_NAME MATCHES "-[0-9].[0-9]$")
+ set(ANDROID_TOOLCHAIN gcc)
+ endif()
+endif()
+if(ANDROID_ABI STREQUAL "armeabi-v7a with NEON")
+ set(ANDROID_ABI armeabi-v7a)
+ set(ANDROID_ARM_NEON TRUE)
+elseif(ANDROID_TOOLCHAIN_NAME AND NOT ANDROID_ABI)
+ if(ANDROID_TOOLCHAIN_NAME MATCHES "^arm-linux-androideabi-")
+ set(ANDROID_ABI armeabi-v7a)
+ elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^aarch64-linux-android-")
+ set(ANDROID_ABI arm64-v8a)
+ elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^x86-")
+ set(ANDROID_ABI x86)
+ elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^x86_64-")
+ set(ANDROID_ABI x86_64)
+ elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^mipsel-linux-android-")
+ set(ANDROID_ABI mips)
+ elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^mips64el-linux-android-")
+ set(ANDROID_ABI mips64)
+ endif()
+endif()
+if(ANDROID_NATIVE_API_LEVEL AND NOT ANDROID_PLATFORM)
+ if(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$")
+ set(ANDROID_PLATFORM ${ANDROID_NATIVE_API_LEVEL})
+ elseif(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$")
+ set(ANDROID_PLATFORM android-${ANDROID_NATIVE_API_LEVEL})
+ endif()
+endif()
+if(DEFINED ANDROID_APP_PIE AND NOT DEFINED ANDROID_PIE)
+ set(ANDROID_PIE "${ANDROID_APP_PIE}")
+endif()
+if(ANDROID_STL_FORCE_FEATURES AND NOT DEFINED ANDROID_CPP_FEATURES)
+ set(ANDROID_CPP_FEATURES "rtti exceptions")
+endif()
+if(DEFINED ANDROID_NO_UNDEFINED AND NOT DEFINED ANDROID_ALLOW_UNDEFINED_SYMBOLS)
+ if(ANDROID_NO_UNDEFINED)
+ set(ANDROID_ALLOW_UNDEFINED_SYMBOLS FALSE)
+ else()
+ set(ANDROID_ALLOW_UNDEFINED_SYMBOLS TRUE)
+ endif()
+endif()
+if(DEFINED ANDROID_SO_UNDEFINED AND NOT DEFINED ANDROID_ALLOW_UNDEFINED_SYMBOLS)
+ set(ANDROID_ALLOW_UNDEFINED_SYMBOLS "${ANDROID_SO_UNDEFINED}")
+endif()
+if(DEFINED ANDROID_FORCE_ARM_BUILD AND NOT ANDROID_ARM_MODE)
+ if(ANDROID_FORCE_ARM_BUILD)
+ set(ANDROID_ARM_MODE arm)
+ else()
+ set(ANDROID_ARM_MODE thumb)
+ endif()
+endif()
+if(DEFINED ANDROID_NOEXECSTACK AND NOT DEFINED ANDROID_DISABLE_NO_EXECUTE)
+ if(ANDROID_NOEXECSTACK)
+ set(ANDROID_DISABLE_NO_EXECUTE FALSE)
+ else()
+ set(ANDROID_DISABLE_NO_EXECUTE TRUE)
+ endif()
+endif()
+if(DEFINED ANDROID_RELRO AND NOT DEFINED ANDROID_DISABLE_RELRO)
+ if(ANDROID_RELRO)
+ set(ANDROID_DISABLE_RELRO FALSE)
+ else()
+ set(ANDROID_DISABLE_RELRO TRUE)
+ endif()
+endif()
+if(NDK_CCACHE AND NOT ANDROID_CCACHE)
+ set(ANDROID_CCACHE "${NDK_CCACHE}")
+endif()
+
+# Default values for configurable variables.
+if(NOT ANDROID_NDK)
+ if(DEFINED ENV{ANDROID_NDK_HOME}
+ AND IS_DIRECTORY "$ENV{ANDROID_NDK_HOME}")
+ set(ANDROID_NDK "$ENV{ANDROID_NDK_HOME}")
+ elseif(DEFINED ENV{ANDROID_HOME}
+ AND IS_DIRECTORY "$ENV{ANDROID_HOME}/ndk-bundle")
+ set(ANDROID_NDK "$ENV{ANDROID_HOME}/ndk-bundle")
+ elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Linux
+ AND IS_DIRECTORY "$ENV{HOME}/Android/Sdk/ndk-bundle")
+ set(ANDROID_NDK "$ENV{HOME}/Android/Sdk/ndk-bundle")
+ elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin
+ AND IS_DIRECTORY "$ENV{HOME}/Library/Android/sdk/ndk-bundle")
+ set(ANDROID_NDK "$ENV{HOME}/Library/Android/sdk/ndk-bundle")
+ elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows
+ AND IS_DIRECTORY "$ENV{LOCALAPPDATA}/Android/Sdk/ndk-bundle")
+ set(ANDROID_NDK "$ENV{LOCALAPPDATA}/Android/Sdk/ndk-bundle")
+ else()
+ message(FATAL_ERROR "Android NDK unspecified.")
+ endif()
+endif()
+file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK)
+if(NOT ANDROID_TOOLCHAIN)
+ set(ANDROID_TOOLCHAIN clang)
+endif()
+if(NOT ANDROID_ABI)
+ set(ANDROID_ABI armeabi-v7a)
+endif()
+if(ANDROID_PLATFORM MATCHES "^android-([0-8]|10|11)$")
+ set(ANDROID_PLATFORM android-9)
+elseif(ANDROID_PLATFORM STREQUAL android-20)
+ set(ANDROID_PLATFORM android-19)
+elseif(NOT ANDROID_PLATFORM)
+ set(ANDROID_PLATFORM android-9)
+endif()
+string(REPLACE "android-" "" ANDROID_PLATFORM_LEVEL ${ANDROID_PLATFORM})
+if(ANDROID_ABI MATCHES "64(-v8a)?$" AND ANDROID_PLATFORM_LEVEL LESS 21)
+ set(ANDROID_PLATFORM android-21)
+ set(ANDROID_PLATFORM_LEVEL 21)
+endif()
+if(NOT ANDROID_STL)
+ set(ANDROID_STL gnustl_static)
+endif()
+if(NOT DEFINED ANDROID_PIE)
+ if(ANDROID_PLATFORM_LEVEL LESS 16)
+ set(ANDROID_PIE FALSE)
+ else()
+ set(ANDROID_PIE TRUE)
+ endif()
+endif()
+if(NOT ANDROID_ARM_MODE)
+ set(ANDROID_ARM_MODE thumb)
+endif()
+
+# Export configurable variables for the try_compile() command.
+set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
+ ANDROID_NDK
+ ANDROID_TOOLCHAIN
+ ANDROID_ABI
+ ANDROID_PLATFORM
+ ANDROID_STL
+ ANDROID_PIE
+ ANDROID_CPP_FEATURES
+ ANDROID_ALLOW_UNDEFINED_SYMBOLS
+ ANDROID_ARM_MODE
+ ANDROID_ARM_NEON
+ ANDROID_DISABLE_NO_EXECUTE
+ ANDROID_DISABLE_RELRO
+ ANDROID_DISABLE_FORMAT_STRING_CHECKS
+ ANDROID_CCACHE)
+
+# Standard cross-compiling stuff.
+set(ANDROID TRUE)
+set(CMAKE_SYSTEM_NAME Android)
+set(CMAKE_SYSTEM_VERSION ${ANDROID_PLATFORM_LEVEL})
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+
+# ABI.
+set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
+if(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+ set(ANDROID_SYSROOT_ABI arm)
+ set(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+ set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME})
+ if(ANDROID_ABI STREQUAL armeabi)
+ set(CMAKE_SYSTEM_PROCESSOR armv5te)
+ set(ANDROID_LLVM_TRIPLE armv5te-none-linux-androideabi)
+ elseif(ANDROID_ABI STREQUAL armeabi-v7a)
+ set(CMAKE_SYSTEM_PROCESSOR armv7-a)
+ set(ANDROID_LLVM_TRIPLE armv7-none-linux-androideabi)
+ endif()
+elseif(ANDROID_ABI STREQUAL arm64-v8a)
+ set(ANDROID_SYSROOT_ABI arm64)
+ set(CMAKE_SYSTEM_PROCESSOR aarch64)
+ set(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+ set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME})
+ set(ANDROID_LLVM_TRIPLE aarch64-none-linux-android)
+elseif(ANDROID_ABI STREQUAL x86)
+ set(ANDROID_SYSROOT_ABI x86)
+ set(CMAKE_SYSTEM_PROCESSOR i686)
+ set(ANDROID_TOOLCHAIN_NAME i686-linux-android)
+ set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_ABI})
+ set(ANDROID_LLVM_TRIPLE i686-none-linux-android)
+elseif(ANDROID_ABI STREQUAL x86_64)
+ set(ANDROID_SYSROOT_ABI x86_64)
+ set(CMAKE_SYSTEM_PROCESSOR x86_64)
+ set(ANDROID_TOOLCHAIN_NAME x86_64-linux-android)
+ set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_ABI})
+ set(ANDROID_LLVM_TRIPLE x86_64-none-linux-android)
+elseif(ANDROID_ABI STREQUAL mips)
+ set(ANDROID_SYSROOT_ABI mips)
+ set(CMAKE_SYSTEM_PROCESSOR mips)
+ set(ANDROID_TOOLCHAIN_NAME mipsel-linux-android)
+ set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME})
+ set(ANDROID_LLVM_TRIPLE mipsel-none-linux-android)
+elseif(ANDROID_ABI STREQUAL mips64)
+ set(ANDROID_SYSROOT_ABI mips64)
+ set(CMAKE_SYSTEM_PROCESSOR mips64)
+ set(ANDROID_TOOLCHAIN_NAME mips64el-linux-android)
+ set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME})
+ set(ANDROID_LLVM_TRIPLE mips64el-none-linux-android)
+else()
+ message(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
+endif()
+
+# STL.
+set(ANDROID_STL_STATIC_LIBRARIES)
+set(ANDROID_STL_SHARED_LIBRARIES)
+if(ANDROID_STL STREQUAL system)
+ set(ANDROID_STL_STATIC_LIBRARIES
+ supc++)
+elseif(ANDROID_STL STREQUAL stlport_static)
+ set(ANDROID_STL_STATIC_LIBRARIES
+ stlport_static)
+elseif(ANDROID_STL STREQUAL stlport_shared)
+ set(ANDROID_STL_SHARED_LIBRARIES
+ stlport_shared)
+elseif(ANDROID_STL STREQUAL gnustl_static)
+ set(ANDROID_STL_STATIC_LIBRARIES
+ gnustl_static)
+elseif(ANDROID_STL STREQUAL gnustl_shared)
+ set(ANDROID_STL_STATIC_LIBRARIES
+ supc++)
+ set(ANDROID_STL_SHARED_LIBRARIES
+ gnustl_shared)
+elseif(ANDROID_STL STREQUAL c++_static)
+ set(ANDROID_STL_STATIC_LIBRARIES
+ c++_static
+ c++abi
+ unwind
+ android_support)
+elseif(ANDROID_STL STREQUAL c++_shared)
+ set(ANDROID_STL_STATIC_LIBRARIES
+ unwind)
+ set(ANDROID_STL_SHARED_LIBRARIES
+ c++_shared)
+elseif(ANDROID_STL STREQUAL none)
+else()
+ message(FATAL_ERROR "Invalid Android STL: ${ANDROID_STL}.")
+endif()
+
+# Sysroot.
+set(CMAKE_SYSROOT "${ANDROID_NDK}/platforms/${ANDROID_PLATFORM}/arch-${ANDROID_SYSROOT_ABI}")
+
+# Toolchain.
+if(CMAKE_HOST_SYSTEM_NAME STREQUAL Linux)
+ set(ANDROID_HOST_TAG linux-x86_64)
+elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin)
+ set(ANDROID_HOST_TAG darwin-x86_64)
+elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
+ set(ANDROID_HOST_TAG windows-x86_64)
+endif()
+set(ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_ROOT}-4.9/prebuilt/${ANDROID_HOST_TAG}")
+set(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
+ set(ANDROID_TOOLCHAIN_SUFFIX .exe)
+endif()
+if(ANDROID_TOOLCHAIN STREQUAL clang)
+ set(ANDROID_LLVM_TOOLCHAIN_PREFIX "${ANDROID_NDK}/toolchains/llvm/prebuilt/${ANDROID_HOST_TAG}/bin/")
+ set(ANDROID_C_COMPILER "${ANDROID_LLVM_TOOLCHAIN_PREFIX}clang${ANDROID_TOOLCHAIN_SUFFIX}")
+ set(ANDROID_CXX_COMPILER "${ANDROID_LLVM_TOOLCHAIN_PREFIX}clang++${ANDROID_TOOLCHAIN_SUFFIX}")
+ # Clang can fail to compile if CMake doesn't correctly supply the target and
+ # external toolchain, but to do so, CMake needs to already know that the
+ # compiler is clang. Tell CMake that the compiler is really clang, but don't
+ # use CMakeForceCompiler, since we still want compile checks. We only want
+ # to skip the compiler ID detection step.
+ set(CMAKE_C_COMPILER_ID_RUN TRUE)
+ set(CMAKE_CXX_COMPILER_ID_RUN TRUE)
+ set(CMAKE_C_COMPILER_ID Clang)
+ set(CMAKE_CXX_COMPILER_ID Clang)
+ set(CMAKE_C_COMPILER_TARGET ${ANDROID_LLVM_TRIPLE})
+ set(CMAKE_CXX_COMPILER_TARGET ${ANDROID_LLVM_TRIPLE})
+ set(CMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN "${ANDROID_TOOLCHAIN_ROOT}")
+ set(CMAKE_CXX_COMPILER_EXTERNAL_TOOLCHAIN "${ANDROID_TOOLCHAIN_ROOT}")
+elseif(ANDROID_TOOLCHAIN STREQUAL gcc)
+ set(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc${ANDROID_TOOLCHAIN_SUFFIX}")
+ set(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++${ANDROID_TOOLCHAIN_SUFFIX}")
+else()
+ message(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}.")
+endif()
+
+# Check that the NDK is valid.
+if(NOT IS_DIRECTORY "${ANDROID_NDK}"
+ OR NOT IS_DIRECTORY "${ANDROID_NDK}/platforms"
+ OR NOT IS_DIRECTORY "${ANDROID_NDK}/prebuilt"
+ OR NOT IS_DIRECTORY "${ANDROID_NDK}/sources"
+ OR NOT IS_DIRECTORY "${ANDROID_NDK}/toolchains")
+ message(FATAL_ERROR "Invalid Android NDK: ${ANDROID_NDK}.")
+elseif(NOT IS_DIRECTORY "${ANDROID_NDK}/platforms/${ANDROID_PLATFORM}")
+ message(FATAL_ERROR "Invalid Android platform: ${ANDROID_PLATFORM}.")
+elseif(NOT IS_DIRECTORY "${CMAKE_SYSROOT}")
+ message(FATAL_ERROR "Invalid Android sysroot: ${CMAKE_SYSROOT}.")
+endif()
+# Check if source.properties file exists.
+if(NOT EXISTS "${ANDROID_NDK}/source.properties")
+ message(FATAL_ERROR "Missing file: ${ANDROID_NDK}/source.properties. Please use NDK r12+.")
+endif()
+file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES)
+set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX
+ "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+\\.[0-9]+\\.[0-9]+(-beta[0-9]+)?)\n$")
+if(NOT ANDROID_NDK_SOURCE_PROPERTIES MATCHES "${ANDROID_NDK_SOURCE_PROPERTIES_REGEX}")
+ message(FATAL_ERROR "Failed to parse Android NDK revision: ${ANDROID_NDK}/source.properties.")
+endif()
+string(REGEX REPLACE "${ANDROID_NDK_SOURCE_PROPERTIES_REGEX}" "\\1"
+ ANDROID_NDK_PACKAGE_REVISION "${ANDROID_NDK_SOURCE_PROPERTIES}")
+if(NOT ANDROID_NDK_PACKAGE_REVISION MATCHES "^${ANDROID_NDK_REVISION}\\.")
+ message(FATAL_ERROR "Invalid Android NDK revision (should be ${ANDROID_NDK_REVISION}): ${ANDROID_NDK_PACKAGE_REVISION}.")
+endif()
+
+set(ANDROID_COMPILER_FLAGS)
+set(ANDROID_COMPILER_FLAGS_CXX)
+set(ANDROID_COMPILER_FLAGS_DEBUG)
+set(ANDROID_COMPILER_FLAGS_RELEASE)
+set(ANDROID_LINKER_FLAGS)
+set(ANDROID_LINKER_FLAGS_EXE)
+
+# Generic flags.
+list(APPEND ANDROID_COMPILER_FLAGS
+ -g
+ -DANDROID
+ -ffunction-sections
+ -funwind-tables
+ -fstack-protector-strong
+ -no-canonical-prefixes)
+list(APPEND ANDROID_COMPILER_FLAGS_CXX
+ -fno-exceptions
+ -fno-rtti)
+list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,--build-id
+ -Wl,--warn-shared-textrel
+ -Wl,--fatal-warnings)
+list(APPEND ANDROID_LINKER_FLAGS_EXE
+ -Wl,--gc-sections
+ -Wl,-z,nocopyreloc)
+
+# Debug and release flags.
+list(APPEND ANDROID_COMPILER_FLAGS_DEBUG
+ -O0)
+if(ANDROID_ABI MATCHES "^armeabi")
+ list(APPEND ANDROID_COMPILER_FLAGS_RELEASE
+ -Os)
+else()
+ list(APPEND ANDROID_COMPILER_FLAGS_RELEASE
+ -O2)
+endif()
+list(APPEND ANDROID_COMPILER_FLAGS_RELEASE
+ -DNDEBUG)
+if(ANDROID_TOOLCHAIN STREQUAL clang)
+ list(APPEND ANDROID_COMPILER_FLAGS_DEBUG
+ -fno-limit-debug-info)
+endif()
+
+# Toolchain and ABI specific flags.
+if(ANDROID_ABI STREQUAL armeabi)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -march=armv5te
+ -mtune=xscale
+ -msoft-float)
+endif()
+if(ANDROID_ABI STREQUAL armeabi-v7a)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -march=armv7-a
+ -mfloat-abi=softfp
+ -mfpu=vfpv3-d16)
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,--fix-cortex-a8)
+endif()
+if(ANDROID_ABI MATCHES "^armeabi" AND ANDROID_TOOLCHAIN STREQUAL clang)
+ # Disable integrated-as for better compatibility.
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -fno-integrated-as)
+endif()
+if(ANDROID_ABI STREQUAL mips AND ANDROID_TOOLCHAIN STREQUAL clang)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -mips32)
+endif()
+
+# STL specific flags.
+if(ANDROID_STL STREQUAL system)
+ set(ANDROID_STL_PREFIX gnu-libstdc++/4.9)
+ set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES
+ "${ANDROID_NDK}/sources/cxx-stl/system/include")
+elseif(ANDROID_STL MATCHES "^stlport_")
+ set(ANDROID_STL_PREFIX stlport)
+ set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/stlport"
+ "${ANDROID_NDK}/sources/cxx-stl/gabi++/include")
+elseif(ANDROID_STL MATCHES "^gnustl_")
+ set(ANDROID_STL_PREFIX gnu-libstdc++/4.9)
+ set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/include"
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}/include"
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/include/backward")
+elseif(ANDROID_STL MATCHES "^c\\+\\+_")
+ set(ANDROID_STL_PREFIX llvm-libc++)
+ if(ANDROID_ABI MATCHES "^armeabi")
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,--exclude-libs,libunwind.a)
+ else()
+ list(REMOVE_ITEM ANDROID_STL_STATIC_LIBRARIES
+ unwind)
+ endif()
+ list(APPEND ANDROID_COMPILER_FLAGS_CXX
+ -std=c++11)
+ if(ANDROID_TOOLCHAIN STREQUAL gcc)
+ list(APPEND ANDROID_COMPILER_FLAGS_CXX
+ -fno-strict-aliasing)
+ endif()
+ set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libcxx/include"
+ "${ANDROID_NDK}/sources/android/support/include"
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}abi/libcxxabi/include")
+endif()
+set(ANDROID_CXX_STANDARD_LIBRARIES)
+foreach(library ${ANDROID_STL_STATIC_LIBRARIES})
+ list(APPEND ANDROID_CXX_STANDARD_LIBRARIES
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}/lib${library}.a")
+endforeach()
+foreach(library ${ANDROID_STL_SHARED_LIBRARIES})
+ list(APPEND ANDROID_CXX_STANDARD_LIBRARIES
+ "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}/lib${library}.so")
+endforeach()
+if(ANDROID_ABI STREQUAL armeabi AND NOT ANDROID_STL MATCHES "^(none|system)$")
+ list(APPEND ANDROID_CXX_STANDARD_LIBRARIES
+ -latomic)
+endif()
+set(CMAKE_C_STANDARD_LIBRARIES_INIT "-lm")
+set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_C_STANDARD_LIBRARIES_INIT}")
+if(ANDROID_CXX_STANDARD_LIBRARIES)
+ string(REPLACE ";" "\" \"" ANDROID_CXX_STANDARD_LIBRARIES "\"${ANDROID_CXX_STANDARD_LIBRARIES}\"")
+ set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_CXX_STANDARD_LIBRARIES}")
+endif()
+
+# Configuration specific flags.
+if(ANDROID_PIE)
+ set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+ list(APPEND ANDROID_LINKER_FLAGS_EXE
+ -pie
+ -fPIE)
+endif()
+if(ANDROID_CPP_FEATURES)
+ separate_arguments(ANDROID_CPP_FEATURES)
+ foreach(feature ${ANDROID_CPP_FEATURES})
+ if(NOT ${feature} MATCHES "^(rtti|exceptions)$")
+ message(FATAL_ERROR "Invalid Android C++ feature: ${feature}.")
+ endif()
+ list(APPEND ANDROID_COMPILER_FLAGS_CXX
+ -f${feature})
+ endforeach()
+ string(REPLACE ";" " " ANDROID_CPP_FEATURES "${ANDROID_CPP_FEATURES}")
+endif()
+if(NOT ANDROID_ALLOW_UNDEFINED_SYMBOLS)
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,--no-undefined)
+endif()
+if(ANDROID_ABI MATCHES "armeabi")
+ if(ANDROID_ARM_MODE STREQUAL thumb)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -mthumb)
+ elseif(ANDROID_ARM_MODE STREQUAL arm)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -marm)
+ else()
+ message(FATAL_ERROR "Invalid Android ARM mode: ${ANDROID_ARM_MODE}.")
+ endif()
+ if(ANDROID_ABI STREQUAL armeabi-v7a AND ANDROID_ARM_NEON)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -mfpu=neon)
+ endif()
+endif()
+if(ANDROID_DISABLE_NO_EXECUTE)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -Wa,--execstack)
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,-z,execstack)
+else()
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -Wa,--noexecstack)
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,-z,noexecstack)
+endif()
+if(ANDROID_TOOLCHAIN STREQUAL clang)
+ # CMake automatically forwards all compiler flags to the linker,
+ # and clang doesn't like having -Wa flags being used for linking.
+ # To prevent CMake from doing this would require meddling with
+ # the CMAKE__COMPILE_OBJECT rules, which would get quite messy.
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Qunused-arguments)
+endif()
+if(ANDROID_DISABLE_RELRO)
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,-z,norelro -Wl,-z,lazy)
+else()
+ list(APPEND ANDROID_LINKER_FLAGS
+ -Wl,-z,relro -Wl,-z,now)
+endif()
+if(ANDROID_DISABLE_FORMAT_STRING_CHECKS)
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -Wno-error=format-security)
+else()
+ list(APPEND ANDROID_COMPILER_FLAGS
+ -Wformat -Werror=format-security)
+endif()
+
+# Convert these lists into strings.
+string(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
+string(REPLACE ";" " " ANDROID_COMPILER_FLAGS_CXX "${ANDROID_COMPILER_FLAGS_CXX}")
+string(REPLACE ";" " " ANDROID_COMPILER_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG}")
+string(REPLACE ";" " " ANDROID_COMPILER_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE}")
+string(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
+string(REPLACE ";" " " ANDROID_LINKER_FLAGS_EXE "${ANDROID_LINKER_FLAGS_EXE}")
+
+if(ANDROID_CCACHE)
+ set(CMAKE_C_COMPILER_LAUNCHER "${ANDROID_CCACHE}")
+ set(CMAKE_CXX_COMPILER_LAUNCHER "${ANDROID_CCACHE}")
+endif()
+set(CMAKE_C_COMPILER "${ANDROID_C_COMPILER}")
+set(CMAKE_CXX_COMPILER "${ANDROID_CXX_COMPILER}")
+set(_CMAKE_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_PREFIX}")
+
+# Set or retrieve the cached flags.
+# This is necessary in case the user sets/changes flags in subsequent
+# configures. If we included the Android flags in here, they would get
+# overwritten.
+set(CMAKE_C_FLAGS ""
+ CACHE STRING "Flags used by the compiler during all build types.")
+set(CMAKE_CXX_FLAGS ""
+ CACHE STRING "Flags used by the compiler during all build types.")
+set(CMAKE_C_FLAGS_DEBUG ""
+ CACHE STRING "Flags used by the compiler during debug builds.")
+set(CMAKE_CXX_FLAGS_DEBUG ""
+ CACHE STRING "Flags used by the compiler during debug builds.")
+set(CMAKE_C_FLAGS_RELEASE ""
+ CACHE STRING "Flags used by the compiler during release builds.")
+set(CMAKE_CXX_FLAGS_RELEASE ""
+ CACHE STRING "Flags used by the compiler during release builds.")
+set(CMAKE_MODULE_LINKER_FLAGS ""
+ CACHE STRING "Flags used by the linker during the creation of modules.")
+set(CMAKE_SHARED_LINKER_FLAGS ""
+ CACHE STRING "Flags used by the linker during the creation of dll's.")
+set(CMAKE_EXE_LINKER_FLAGS ""
+ CACHE STRING "Flags used by the linker.")
+
+set(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${ANDROID_COMPILER_FLAGS_CXX} ${CMAKE_CXX_FLAGS}")
+set(CMAKE_C_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}")
+set(CMAKE_CXX_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}")
+set(CMAKE_C_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}")
+set(CMAKE_CXX_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}")
+set(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+set(CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${ANDROID_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}")
+
+# Compatibility for read-only variables.
+# Read-only variables for compatibility with the other toolchain file.
+# We'll keep these around for the existing projects that still use them.
+# TODO: All of the variables here have equivalents in our standard set of
+# configurable variables, so we can remove these once most of our users migrate
+# to those variables.
+set(ANDROID_NATIVE_API_LEVEL ${ANDROID_PLATFORM_LEVEL})
+if(ANDROID_ALLOW_UNDEFINED_SYMBOLS)
+ set(ANDROID_SO_UNDEFINED TRUE)
+else()
+ set(ANDROID_NO_UNDEFINED TRUE)
+endif()
+set(ANDROID_FUNCTION_LEVEL_LINKING TRUE)
+set(ANDROID_GOLD_LINKER TRUE)
+if(NOT ANDROID_DISABLE_NO_EXECUTE)
+ set(ANDROID_NOEXECSTACK TRUE)
+endif()
+if(NOT ANDROID_DISABLE_RELRO)
+ set(ANDROID_RELRO TRUE)
+endif()
+if(ANDROID_ARM_MODE STREQUAL arm)
+ set(ANDROID_FORCE_ARM_BUILD TRUE)
+endif()
+if(ANDROID_CPP_FEATURES MATCHES "rtti"
+ AND ANDROID_CPP_FEATURES MATCHES "exceptions")
+ set(ANDROID_STL_FORCE_FEATURES TRUE)
+endif()
+if(ANDROID_CCACHE)
+ set(NDK_CCACHE "${ANDROID_CCACHE}")
+endif()
+if(ANDROID_TOOLCHAIN STREQUAL clang)
+ set(ANDROID_TOOLCHAIN_NAME ${ANDROID_TOOLCHAIN_NAME}-clang)
+else()
+ set(ANDROID_TOOLCHAIN_NAME ${ANDROID_TOOLCHAIN_NAME}-4.9)
+endif()
+set(ANDROID_NDK_HOST_X64 TRUE)
+set(ANDROID_NDK_LAYOUT RELEASE)
+if(ANDROID_ABI STREQUAL armeabi)
+ set(ARMEABI TRUE)
+elseif(ANDROID_ABI STREQUAL armeabi-v7a)
+ set(ARMEABI_V7A TRUE)
+ if(ANDROID_ARM_NEON)
+ set(NEON TRUE)
+ endif()
+elseif(ANDROID_ABI STREQUAL arm64-v8a)
+ set(ARM64_V8A TRUE)
+elseif(ANDROID_ABI STREQUAL x86)
+ set(X86 TRUE)
+elseif(ANDROID_ABI STREQUAL x86_64)
+ set(X86_64 TRUE)
+elseif(ANDROID_ABI STREQUAL mips)
+ set(MIPS TRUE)
+elseif(ANDROID_ABI STREQUAL MIPS64)
+ set(MIPS64 TRUE)
+endif()
+set(ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_HOST_TAG})
+set(ANDROID_NDK_ABI_NAME ${ANDROID_ABI})
+set(ANDROID_NDK_RELEASE r${ANDROID_NDK_REVISION})
+set(ANDROID_ARCH_NAME ${ANDROID_SYSROOT_ABI})
+set(ANDROID_SYSROOT "${CMAKE_SYSROOT}")
+set(TOOL_OS_SUFFIX ${ANDROID_TOOLCHAIN_SUFFIX})
+if(ANDROID_TOOLCHAIN STREQUAL clang)
+ set(ANDROID_COMPILER_IS_CLANG TRUE)
+endif()
diff --git a/libyuv/src/main/cpp/libyuv/build_overrides/build.gni b/libyuv/src/main/cpp/libyuv/build_overrides/build.gni
new file mode 100644
index 0000000..9bb73ab
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/build_overrides/build.gni
@@ -0,0 +1,38 @@
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Some non-Chromium builds don't use Chromium's third_party/binutils.
+linux_use_bundled_binutils_override = true
+
+# Variable that can be used to support multiple build scenarios, like having
+# Chromium specific targets in a client project's GN file etc.
+build_with_chromium = false
+
+# Some non-Chromium builds don't support building java targets.
+enable_java_templates = true
+
+# Allow using custom suppressions files (currently not used by libyuv).
+asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc"
+lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc"
+tsan_suppressions_file = "//build/sanitizers/tsan_suppressions.cc"
+
+msan_blacklist_path =
+ rebase_path("//tools_libyuv/msan/blacklist.txt", root_build_dir)
+ubsan_blacklist_path =
+ rebase_path("//tools_libyuv/ubsan/blacklist.txt", root_build_dir)
+ubsan_vptr_blacklist_path =
+ rebase_path("//tools_libyuv/ubsan/vptr_blacklist.txt", root_build_dir)
+
+# For Chromium, Android 32-bit non-component, non-clang builds hit a 4GiB size
+# limit, making them requiring symbol_level=2. WebRTC doesn't hit that problem
+# so we just ignore that assert. See https://crbug.com/648948 for more info.
+ignore_elf32_limitations = true
+
+# Use system Xcode installation instead of the Chromium bundled Mac toolchain,
+# since it contains only SDK 10.11, not 10.12 which WebRTC needs.
+use_system_xcode = true
diff --git a/libyuv/src/main/cpp/libyuv/build_overrides/gtest.gni b/libyuv/src/main/cpp/libyuv/build_overrides/gtest.gni
new file mode 100644
index 0000000..d3c3f68
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/build_overrides/gtest.gni
@@ -0,0 +1,19 @@
+# Copyright (c) 2016 The LibYuv project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Include support for registering main function in multi-process tests.
+gtest_include_multiprocess = true
+
+# Include support for platform-specific operations across unit tests.
+gtest_include_platform_test = true
+
+# Exclude support for testing Objective C code on OS X and iOS.
+gtest_include_objc_support = true
+
+# Exclude support for flushing coverage files on iOS.
+gtest_include_ios_coverage = true
diff --git a/libyuv/src/main/cpp/libyuv/cleanup_links.py b/libyuv/src/main/cpp/libyuv/cleanup_links.py
new file mode 100644
index 0000000..ba29078
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/cleanup_links.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of the file from WebRTC in:
+# https://chromium.googlesource.com/external/webrtc/+/master/cleanup_links.py
+
+"""Script to cleanup symlinks created from setup_links.py.
+
+Before 177567c518b121731e507e9b9c4049c4dc96e4c8 (#15754) we had a Chromium
+checkout which we created symlinks into. In order to do clean syncs after
+landing that change, this script cleans up any old symlinks, avoiding annoying
+manual cleanup needed in order to complete gclient sync.
+"""
+
+import logging
+import optparse
+import os
+import shelve
+import subprocess
+import sys
+
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+LINKS_DB = 'links'
+
+# Version management to make future upgrades/downgrades easier to support.
+SCHEMA_VERSION = 1
+
+class WebRTCLinkSetup(object):
+ def __init__(self, links_db, dry_run=False):
+ self._dry_run = dry_run
+ self._links_db = links_db
+
+ def CleanupLinks(self):
+ logging.debug('CleanupLinks')
+ for source, link_path in self._links_db.iteritems():
+ if source == 'SCHEMA_VERSION':
+ continue
+ if os.path.islink(link_path) or sys.platform.startswith('win'):
+ # os.path.islink() always returns false on Windows
+ # See http://bugs.python.org/issue13143.
+ logging.debug('Removing link to %s at %s', source, link_path)
+ if not self._dry_run:
+ if os.path.exists(link_path):
+ if sys.platform.startswith('win') and os.path.isdir(link_path):
+ subprocess.check_call(['rmdir', '/q', '/s', link_path],
+ shell=True)
+ else:
+ os.remove(link_path)
+ del self._links_db[source]
+
+
+def _initialize_database(filename):
+ links_database = shelve.open(filename)
+ # Wipe the database if this version of the script ends up looking at a
+ # newer (future) version of the links db, just to be sure.
+ version = links_database.get('SCHEMA_VERSION')
+ if version and version != SCHEMA_VERSION:
+ logging.info('Found database with schema version %s while this script only '
+ 'supports %s. Wiping previous database contents.', version,
+ SCHEMA_VERSION)
+ links_database.clear()
+ links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
+ return links_database
+
+
+def main():
+ parser = optparse.OptionParser()
+ parser.add_option('-d', '--dry-run', action='store_true', default=False,
+ help='Print what would be done, but don\'t perform any '
+ 'operations. This will automatically set logging to '
+ 'verbose.')
+ parser.add_option('-v', '--verbose', action='store_const',
+ const=logging.DEBUG, default=logging.INFO,
+ help='Print verbose output for debugging.')
+ options, _ = parser.parse_args()
+
+ if options.dry_run:
+ options.verbose = logging.DEBUG
+ logging.basicConfig(format='%(message)s', level=options.verbose)
+
+ # Work from the root directory of the checkout.
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ os.chdir(script_dir)
+
+ # The database file gets .db appended on some platforms.
+ db_filenames = [LINKS_DB, LINKS_DB + '.db']
+ if any(os.path.isfile(f) for f in db_filenames):
+ links_database = _initialize_database(LINKS_DB)
+ try:
+ symlink_creator = WebRTCLinkSetup(links_database, options.dry_run)
+ symlink_creator.CleanupLinks()
+ finally:
+ for f in db_filenames:
+ if os.path.isfile(f):
+ os.remove(f)
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/libyuv/src/main/cpp/libyuv/codereview.settings b/libyuv/src/main/cpp/libyuv/codereview.settings
new file mode 100644
index 0000000..00ba1d3
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/codereview.settings
@@ -0,0 +1,6 @@
+# This file is used by git cl to get repository specific information.
+CODE_REVIEW_SERVER: codereview.chromium.org
+GERRIT_HOST: True
+PROJECT: libyuv
+TRY_ON_UPLOAD: False
+VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
diff --git a/libyuv/src/main/cpp/libyuv/docs/deprecated_builds.md b/libyuv/src/main/cpp/libyuv/docs/deprecated_builds.md
new file mode 100644
index 0000000..d54a028
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/docs/deprecated_builds.md
@@ -0,0 +1,440 @@
+# Deprecated Builds
+
+Older documentation on build configs which are no longer supported.
+
+## Pre-requisites
+
+You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools
+Refer to chromium instructions for each platform for other prerequisites.
+
+## Getting the Code
+
+Create a working directory, enter it, and run:
+
+ gclient config https://chromium.googlesource.com/libyuv/libyuv
+ gclient sync
+
+
+Then you'll get a .gclient file like:
+
+ solutions = [
+ { "name" : "libyuv",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ },
+ "safesync_url": "",
+ },
+ ];
+
+
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+
+Browse the Git reprository: https://chromium.googlesource.com/libyuv/libyuv/+/master
+
+### Android
+For Android add `;target_os=['android'];` to your Linux .gclient
+
+
+ solutions = [
+ { "name" : "libyuv",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ },
+ "safesync_url": "",
+ },
+ ];
+ target_os = ["android", "unix"];
+
+Then run:
+
+ export GYP_DEFINES="OS=android"
+ gclient sync
+
+Caveat: Theres an error with Google Play services updates. If you get the error "Your version of the Google Play services library is not up to date", run the following:
+
+ cd chromium/src
+ ./build/android/play_services/update.py download
+ cd ../..
+
+For Windows the gclient sync must be done from an Administrator command prompt.
+
+The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
+
+To get just the source (not buildable):
+
+ git clone https://chromium.googlesource.com/libyuv/libyuv
+
+
+## Building the Library and Unittests
+
+### Windows
+
+ set GYP_DEFINES=target_arch=ia32
+ call python gyp_libyuv -fninja -G msvs_version=2013
+ ninja -j7 -C out\Release
+ ninja -j7 -C out\Debug
+
+ set GYP_DEFINES=target_arch=x64
+ call python gyp_libyuv -fninja -G msvs_version=2013
+ ninja -C out\Debug_x64
+ ninja -C out\Release_x64
+
+#### Building with clangcl
+ set GYP_DEFINES=clang=1 target_arch=ia32
+ call python tools\clang\scripts\update.py
+ call python gyp_libyuv -fninja libyuv_test.gyp
+ ninja -C out\Debug
+ ninja -C out\Release
+
+### OSX
+
+Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+
+ GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+ GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+### iOS
+http://www.chromium.org/developers/how-tos/build-instructions-ios
+
+Add to .gclient last line: `target_os=['ios'];`
+
+armv7
+
+ GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+ ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+ ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+arm64
+
+ GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+ ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+ ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+both armv7 and arm64 (fat)
+
+ GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+ ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+ ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+simulator
+
+ GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv
+ ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest
+ ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest
+
+### Android
+https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
+
+Add to .gclient last line: `target_os=['android'];`
+
+armv7
+
+ GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+arm64
+
+ GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+ia32
+
+ GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+ GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+
+mipsel
+
+ GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+arm32 disassembly:
+
+ third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+
+arm64 disassembly:
+
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+
+Running tests:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+
+Running test as benchmark:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1"
+
+Running test with C code:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+
+#### Building with GN
+
+ gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+ ninja -C out/Release
+ ninja -C out/Debug
+
+### Building Offical with GN
+
+ gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
+ ninja -C out/Official
+
+#### Building mips with GN
+
+mipsel
+ gn gen out/Default "--args=is_debug=false target_cpu=\"mipsel\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false"
+ ninja -C out/Default
+
+mips64el
+ gn gen out/Default "--args=is_debug=false target_cpu=\"mips64el\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false"
+ ninja -C out/Default
+
+### Linux
+
+ GYP_DEFINES="target_arch=x64" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+ GYP_DEFINES="target_arch=ia32" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+#### CentOS
+
+On CentOS 32 bit the following work around allows a sync:
+
+ export GYP_DEFINES="host_arch=ia32"
+ gclient sync
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'.
+
+ gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+
+### Build targets
+
+ ninja -C out/Debug libyuv
+ ninja -C out/Debug libyuv_unittest
+ ninja -C out/Debug compare
+ ninja -C out/Debug yuvconvert
+ ninja -C out/Debug psnr
+ ninja -C out/Debug cpuid
+
+
+## Building the Library with make
+
+### Linux
+
+ make -j7 V=1 -f linux.mk
+ make -j7 V=1 -f linux.mk clean
+ make -j7 V=1 -f linux.mk CXX=clang++
+
+## Building the Library with cmake
+
+Install cmake: http://www.cmake.org/
+
+Default debug build:
+
+ mkdir out
+ cd out
+ cmake ..
+ cmake --build .
+
+Release build/install
+
+ mkdir out
+ cd out
+ cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" ..
+ cmake --build . --config Release
+ sudo cmake --build . --target install --config Release
+
+### Windows 8 Phone
+
+Pre-requisite:
+
+* Install Visual Studio 2012 and Arm to your environment.
+
+Then:
+
+ call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+or with Visual Studio 2013:
+
+ call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+ nmake /f winarm.mk clean
+ nmake /f winarm.mk
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this.
+
+ gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+### 64 bit Windows
+
+ set GYP_DEFINES=target_arch=x64
+ gclient runhooks V=1
+
+### ARM Linux
+
+ export GYP_DEFINES="target_arch=arm"
+ export CROSSTOOL=``/arm-none-linux-gnueabi
+ export CXX=$CROSSTOOL-g++
+ export CC=$CROSSTOOL-gcc
+ export AR=$CROSSTOOL-ar
+ export AS=$CROSSTOOL-as
+ export RANLIB=$CROSSTOOL-ranlib
+ gclient runhooks
+
+## Running Unittests
+
+### Windows
+
+ out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
+
+### OSX
+
+ out/Release/libyuv_unittest --gtest_filter="*"
+
+### Linux
+
+ out/Release/libyuv_unittest --gtest_filter="*"
+
+Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g.
+
+ out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt
+
+## CPU Emulator tools
+
+### Intel SDE (Software Development Emulator)
+
+Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator
+
+Then run:
+
+ c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*
+
+
+## Memory tools
+
+### Running Dr Memory memcheck for Windows
+
+Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
+
+ set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32
+ call python gyp_libyuv -fninja -G msvs_version=2013
+ ninja -C out\Debug
+ drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
+
+### Running UBSan
+
+See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer
+
+Sanitizers available: TSan, MSan, ASan, UBSan, LSan
+
+ GYP_DEFINES='ubsan=1' gclient runhooks
+ ninja -C out/Release
+
+### Running Valgrind memcheck
+
+Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance.
+
+[1]: http://valgrind.org
+
+ solutions = [
+ { "name" : "libyuv",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries",
+ },
+ "safesync_url": "",
+ },
+ ]
+
+Then run:
+
+ GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv
+ ninja -C out/Debug
+ valgrind out/Debug/libyuv_unittest
+
+
+For more information, see http://www.chromium.org/developers/how-tos/using-valgrind
+
+### Running Thread Sanitizer (TSan)
+
+ GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv
+ ninja -C out/Debug
+ valgrind out/Debug/libyuv_unittest
+
+For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer
+
+### Running Address Sanitizer (ASan)
+
+ GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv
+ ninja -C out/Debug
+ valgrind out/Debug/libyuv_unittest
+
+For more info, see http://dev.chromium.org/developers/testing/addresssanitizer
+
+## Benchmarking
+
+The unittests can be used to benchmark.
+
+### Windows
+
+ set LIBYUV_WIDTH=1280
+ set LIBYUV_HEIGHT=720
+ set LIBYUV_REPEAT=999
+ set LIBYUV_FLAGS=-1
+ out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt
+
+
+### Linux and Mac
+
+ LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+
+ libyuvTest.I420ToARGB_Opt (547 ms)
+
+Indicates 0.547 ms/frame for 1280 x 720.
+
+## Making a change
+
+ gclient sync
+ git checkout -b mycl -t origin/master
+ git pull
+
+ git add -u
+ git commit -m "my change"
+ git cl lint
+ git cl try
+ git cl upload -r a-reviewer@chomium.org -s
+
+ git cl land
diff --git a/libyuv/src/main/cpp/libyuv/docs/environment_variables.md b/libyuv/src/main/cpp/libyuv/docs/environment_variables.md
new file mode 100644
index 0000000..5802599
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/docs/environment_variables.md
@@ -0,0 +1,32 @@
+# Introduction
+
+For test purposes, environment variables can be set to control libyuv behavior. These should only be used for testing, to narrow down bugs or to test performance.
+
+# CPU
+
+By default the cpu is detected and the most advanced form of SIMD is used. But you can disable instruction sets selectively, or completely, falling back on C code. Set the variable to 1 to disable the specified instruction set.
+
+ LIBYUV_DISABLE_ASM
+ LIBYUV_DISABLE_X86
+ LIBYUV_DISABLE_SSE2
+ LIBYUV_DISABLE_SSSE3
+ LIBYUV_DISABLE_SSE41
+ LIBYUV_DISABLE_SSE42
+ LIBYUV_DISABLE_AVX
+ LIBYUV_DISABLE_AVX2
+ LIBYUV_DISABLE_AVX3
+ LIBYUV_DISABLE_ERMS
+ LIBYUV_DISABLE_FMA3
+ LIBYUV_DISABLE_DSPR2
+ LIBYUV_DISABLE_NEON
+
+# Test Width/Height/Repeat
+
+The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions.
+You can also repeat the test a specified number of iterations, allowing benchmarking and profiling.
+
+ set LIBYUV_WIDTH=1280
+ set LIBYUV_HEIGHT=720
+ set LIBYUV_REPEAT=999
+ set LIBYUV_FLAGS=-1
+ set LIBYUV_CPU_INFO=-1
diff --git a/libyuv/src/main/cpp/libyuv/docs/filtering.md b/libyuv/src/main/cpp/libyuv/docs/filtering.md
new file mode 100644
index 0000000..8696976
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/docs/filtering.md
@@ -0,0 +1,196 @@
+# Introduction
+
+This document discusses the current state of filtering in libyuv. An emphasis on maximum performance while avoiding memory exceptions, and minimal amount of code/complexity. See future work at end.
+
+# LibYuv Filter Subsampling
+
+There are 2 challenges with subsampling
+
+1. centering of samples, which involves clamping on edges
+2. clipping a source region
+
+Centering depends on scale factor and filter mode.
+
+# Down Sampling
+
+If scaling down, the stepping rate is always src_width / dst_width.
+
+ dx = src_width / dst_width;
+
+e.g. If scaling from 1280x720 to 640x360, the step thru the source will be 2.0, stepping over 2 pixels of source for each pixel of destination.
+
+Centering, depends on filter mode.
+
+*Point* downsampling takes the middle pixel.
+
+ x = dx >> 1;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle. For even scale factors, this rounds up and takes the pixel to the right of center. e.g. scale of 4x down will take pixel 2.
+
+**Bilinear** filter, uses the 2x2 pixels in the middle.
+
+ x = dx / 2 - 0.5;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle, and point sampling is used.
+For even scale factors, this evenly filters the middle 2x2 pixels. e.g. 4x down will filter pixels 1,2 at 50% in both directions.
+
+**Box** filter averages the entire box so sampling starts at 0.
+
+ x = 0;
+
+For a scale factor of 2x down, this is equivalent to bilinear.
+
+# Up Sampling
+
+**Point** upsampling use stepping rate of src_width / dst_width and a starting coordinate of 0.
+
+ x = 0;
+ dx = src_width / dst_width;
+
+e.g. If scaling from 640x360 to 1280x720 the step thru the source will be 0.0, stepping half a pixel of source for each pixel of destination. Each pixel is replicated by the scale factor.
+
+**Bilinear** filter stretches such that the first pixel of source maps to the first pixel of destination, and the last pixel of source maps to the last pixel of destination.
+
+ x = 0;
+ dx = (src_width - 1) / (dst_width - 1);
+
+This method is not technically correct, and will likely change in the future.
+
+* It is inconsistent with the bilinear down sampler. The same method could be used for down sampling, and then it would be more reversible, but that would prevent specialized 2x down sampling.
+* Although centered, the image is slightly magnified.
+* The filtering was changed in early 2013 - previously it used:
+
+ x = 0;
+ dx = (src_width - 1) / (dst_width - 1);
+
+Which is the correct scale factor, but shifted the image left, and extruded the last pixel. The reason for the change was to remove the extruding code from the low level row functions, allowing 3 functions to sshare the same row functions - ARGBScale, I420Scale, and ARGBInterpolate. Then the one function was ported to many cpu variations: SSE2, SSSE3, AVX2, Neon and 'Any' version for any number of pixels and alignment. The function is also specialized for 0,25,50,75%.
+
+The above goes still has the potential to read the last pixel 100% and last pixel + 1 0%, which may cause a memory exception. So the left pixel goes to a fraction less than the last pixel, but filters in the minimum amount of it, and the maximum of the last pixel.
+
+ dx = FixedDiv((src_width << 16) - 0x00010001, (dst << 16) - 0x00010000);
+
+**Box** filter for upsampling switches over to Bilinear.
+
+# Scale snippet:
+
+ #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+ #define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \
+ (dst << 16) - 0x00010000);
+
+ // Compute slope values for stepping.
+ void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode filtering,
+ int* x, int* y, int* dx, int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ if (filtering == kFilterBox) {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768);
+ } else if (dst_width > 1) {
+ *dx = FIXEDDIV1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // 32768 = -0.5 to center bilinear.
+ } else if (dst_height > 1) {
+ *dy = FIXEDDIV1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768);
+ } else if (dst_width > 1) {
+ *dx = FIXEDDIV1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0);
+ *y = CENTERSTART(*dy, 0);
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx;
+ *dx = -*dx;
+ src_width = -src_width;
+ }
+ }
+
+# Future Work
+
+Point sampling should ideally be the same as bilinear, but pixel by pixel, round to nearest neighbor. But as is, it is reversible and exactly matches ffmpeg at all scale factors, both up and down. The scale factor is
+
+ dx = src_width / dst_width;
+
+The step value is centered for down sample:
+
+ x = dx / 2;
+
+Or starts at 0 for upsample.
+
+ x = 0;
+
+Bilinear filtering is currently correct for down sampling, but not for upsampling.
+Upsampling is stretching the first and last pixel of source, to the first and last pixel of destination.
+
+ dx = (src_width - 1) / (dst_width - 1);
+ x = 0;
+
+It should be stretching such that the first pixel is centered in the middle of the scale factor, to match the pixel that would be sampled for down sampling by the same amount. And same on last pixel.
+
+ dx = src_width / dst_width;
+ x = dx / 2 - 0.5;
+
+This would start at -0.5 and go to last pixel + 0.5, sampling 50% from last pixel + 1.
+Then clamping would be needed. On GPUs there are numerous ways to clamp.
+
+1. Clamp the coordinate to the edge of the texture, duplicating the first and last pixel.
+2. Blend with a constant color, such as transparent black. Typically best for fonts.
+3. Mirror the UV coordinate, which is similar to clamping. Good for continuous tone images.
+4. Wrap the coordinate, for texture tiling.
+5. Allow the coordinate to index beyond the image, which may be the correct data if sampling a subimage.
+6. Extrapolate the edge based on the previous pixel. pixel -0.5 is computed from slope of pixel 0 and 1.
+
+Some of these are computational, even for a GPU, which is one reason textures are sometimes limited to power of 2 sizes.
+We do care about the clipping case, where allowing coordinates to become negative and index pixels before the image is the correct data. But normally for simple scaling, we want to clamp to the edge pixel. For example, if bilinear scaling from 3x3 to 30x30, we’d essentially want 10 pixels of each of the original 3 pixels. But we want the original pixels to land in the middle of each 10 pixels, at offsets 5, 15 and 25. There would be filtering between 5 and 15 between the original pixels 0 and 1. And filtering between 15 and 25 from original pixels 1 and 2. The first 5 pixels are clamped to pixel 0 and the last 5 pixels are clamped to pixel 2.
+The easiest way to implement this is copy the original 3 pixels to a buffer, and duplicate the first and last pixels. 0,1,2 becomes 0, 0,1,2, 2. Then implement a filtering without clamping. We call this source extruding. Its only necessary on up sampling, since down sampler will always have valid surrounding pixels.
+Extruding is practical when the image is already copied to a temporary buffer. It could be done to the original image, as long as the original memory is restored, but valgrind and/or memory protection would disallow this, so it requires a memcpy to a temporary buffer, which may hurt performance. The memcpy has a performance advantage, from a cache point of view, that can actually make this technique faster, depending on hardware characteristics.
+Vertical extrusion can be done with a memcpy of the first/last row, or clamping a pointer.
+
+
+The other way to implement clamping is handle the edges with a memset. e.g. Read first source pixel and memset the first 5 pixels. Filter pixels 0,1,2 to 5 to 25. Read last pixel and memset the last 5 pixels. Blur is implemented with this method like this, which has 3 loops per row - left, middle and right.
+
+Box filter is only used for 2x down sample or more. Its based on integer sized boxes. Technically it should be filtered edges, but thats substantially slower (roughly 100x), and at that point you may as well do a cubic filter which is more correct.
+
+Box filter currently sums rows into a row buffer. It does this with
+
+Mirroring will use the same slope as normal, but with a negative.
+The starting coordinate needs to consider the scale factor and filter. e.g. box filter of 30x30 to 3x3 with mirroring would use -10 for step, but x = 20. width (30) - dx.
+
+Step needs to be accurate, so it uses an integer divide. This is as much as 5% of the profile. An approximated divide is substantially faster, but the inaccuracy causes stepping beyond the original image boundaries. 3 general solutions:
+
+1. copy image to buffer with padding. allows for small errors in stepping.
+2. hash the divide, so common values are quickly found.
+3. change api so caller provides the slope.
diff --git a/libyuv/src/main/cpp/libyuv/docs/formats.md b/libyuv/src/main/cpp/libyuv/docs/formats.md
new file mode 100644
index 0000000..cddfe02
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/docs/formats.md
@@ -0,0 +1,140 @@
+# Introduction
+
+Formats (FOURCC) supported by libyuv are detailed here.
+
+# Core Formats
+
+There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB.
+
+Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
+
+# OSX Core Media Pixel Formats
+
+This is how OSX formats map to libyuv
+
+ enum {
+ kCMPixelFormat_32ARGB = 32, FOURCC_BGRA
+ kCMPixelFormat_32BGRA = 'BGRA', FOURCC_ARGB
+ kCMPixelFormat_24RGB = 24, FOURCC_RAW
+ kCMPixelFormat_16BE555 = 16, Not supported.
+ kCMPixelFormat_16BE565 = 'B565', Not supported.
+ kCMPixelFormat_16LE555 = 'L555', FOURCC_RGBO
+ kCMPixelFormat_16LE565 = 'L565', FOURCC_RGBP
+ kCMPixelFormat_16LE5551 = '5551', FOURCC_RGBO
+ kCMPixelFormat_422YpCbCr8 = '2vuy', FOURCC_UYVY
+ kCMPixelFormat_422YpCbCr8_yuvs = 'yuvs', FOURCC_YUY2
+ kCMPixelFormat_444YpCbCr8 = 'v308', FOURCC_I444 ?
+ kCMPixelFormat_4444YpCbCrA8 = 'v408', Not supported.
+ kCMPixelFormat_422YpCbCr16 = 'v216', Not supported.
+ kCMPixelFormat_422YpCbCr10 = 'v210', FOURCC_V210 previously. Removed now.
+ kCMPixelFormat_444YpCbCr10 = 'v410', Not supported.
+ kCMPixelFormat_8IndexedGray_WhiteIsZero = 0x00000028, Not supported.
+ };
+
+
+# FOURCC (Four Charactacter Code) List
+
+The following is extracted from video_common.h as a complete list of formats supported by libyuv.
+
+ enum FourCC {
+ // 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+ // 1 Secondary YUV formats: row biplanar.
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+
+ // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
+
+ // 4 Secondary RGB formats: 4 Bayer Patterns.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // 1 Primary Compressed YUV format.
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+ // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+ // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+ FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB
+ FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB
+ FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO.
+ FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
+ FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
+
+ // 1 Auxiliary compressed YUV format set aside for capturer.
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+# Planar YUV
+ The following formats contains a full size Y plane followed by 1 or 2
+ planes for UV: I420, I422, I444, I400, NV21, NV12, I400
+ The size (subsampling) of the UV varies.
+ I420, NV12 and NV21 are half width, half height
+ I422, NV16 and NV61 are half width, full height
+ I444, NV24 and NV42 are full width, full height
+ I400 and J400 have no chroma channel.
+
+# The ARGB FOURCC
+
+There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.
+
+The fourcc describes the order of channels in a ***register***.
+
+A fourcc provided by capturer, can be thought of string, e.g. "ARGB".
+
+On little endian machines, as an int, this would have 'A' in the lowest byte. The FOURCC macro reverses the order:
+
+ #define FOURCC(a, b, c, d) (((uint32)(a)) | ((uint32)(b) << 8) | ((uint32)(c) << 16) | ((uint32)(d) << 24))
+
+So the "ARGB" string, read as an uint32, is
+
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B')
+
+If you were to read ARGB pixels as uint32's, the alpha would be in the high byte, and the blue in the lowest byte. In memory, these are stored little endian, so 'B' is first, then 'G', 'R' and 'A' last.
+
+When calling conversion functions, the names match the FOURCC, so in this case it would be I420ToARGB().
+
+All formats can be converted to/from ARGB.
+
+Most 'planar_functions' work on ARGB (e.g. ARGBBlend).
+
+Some are channel order agnostic (e.g. ARGBScale).
+
+Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so its a macro).
+
+ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha. Other functions don't care.
diff --git a/libyuv/src/main/cpp/libyuv/docs/getting_started.md b/libyuv/src/main/cpp/libyuv/docs/getting_started.md
new file mode 100644
index 0000000..5df4c30
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/docs/getting_started.md
@@ -0,0 +1,289 @@
+# Getting Started
+
+How to get and build the libyuv code.
+
+## Pre-requisites
+
+You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools
+Refer to chromium instructions for each platform for other prerequisites.
+
+## Getting the Code
+
+Create a working directory, enter it, and run:
+
+ gclient config --name src https://chromium.googlesource.com/libyuv/libyuv
+ gclient sync
+
+Then you'll get a .gclient file like:
+
+ solutions = [
+ { "name" : "src",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ },
+ "safesync_url": "",
+ },
+ ];
+
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+
+Browse the Git reprository: https://chromium.googlesource.com/libyuv/libyuv/+/master
+
+### Android
+For Android add `;target_os=['android'];` to your Linux .gclient
+
+ solutions = [
+ { "name" : "src",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ },
+ "safesync_url": "",
+ },
+ ];
+ target_os = ["android", "linux"];
+
+Then run:
+
+ export GYP_DEFINES="OS=android"
+ gclient sync
+
+The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
+
+To get just the source (not buildable):
+
+ git clone https://chromium.googlesource.com/libyuv/libyuv
+
+
+## Building the Library and Unittests
+
+### Windows
+
+ call gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+ call gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+ call gn gen out/Release "--args=is_debug=false target_cpu=\"x64\""
+ call gn gen out/Debug "--args=is_debug=true target_cpu=\"x64\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+#### Building with clang-cl
+
+ set GYP_DEFINES=clang=1 target_arch=ia32
+ call python tools\clang\scripts\update.py
+
+ call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x86\""
+ call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x86\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+ call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x64\""
+ call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x64\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+### macOS and Linux
+
+ gn gen out/Release "--args=is_debug=false"
+ gn gen out/Debug "--args=is_debug=true"
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+### Building Offical with GN
+
+ gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
+ ninja -C out/Official
+
+### iOS
+http://www.chromium.org/developers/how-tos/build-instructions-ios
+
+Add to .gclient last line: `target_os=['ios'];`
+
+arm64
+
+ gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+ios simulator
+
+ gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+### Android
+https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
+
+Add to .gclient last line: `target_os=['android'];`
+
+armv7
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+arm64
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm64\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+ia32
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"x86\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+mipsel
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+arm disassembly:
+
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
+
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
+
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+
+Running tests:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+
+Running test as benchmark:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1"
+
+Running test with C code:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+
+### Build targets
+
+ ninja -C out/Debug libyuv
+ ninja -C out/Debug libyuv_unittest
+ ninja -C out/Debug compare
+ ninja -C out/Debug convert
+ ninja -C out/Debug psnr
+ ninja -C out/Debug cpuid
+
+### ARM Linux
+
+ gn gen out/Release "--args=is_debug=false target_cpu=\"arm64\""
+ gn gen out/Debug "--args=is_debug=true target_cpu=\"arm64\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+## Building the Library with make
+
+### Linux
+
+ make V=1 -f linux.mk
+ make V=1 -f linux.mk clean
+ make V=1 -f linux.mk CXX=clang++
+
+## Building the library with cmake
+
+Install cmake: http://www.cmake.org/
+
+### Default debug build:
+
+ mkdir out
+ cd out
+ cmake ..
+ cmake --build .
+
+### Release build/install
+
+ mkdir out
+ cd out
+ cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" ..
+ cmake --build . --config Release
+ sudo cmake --build . --target install --config Release
+
+### Build RPM/DEB packages
+
+ mkdir out
+ cd out
+ cmake -DCMAKE_BUILD_TYPE=Release ..
+ make -j4
+ make package
+
+## Setup for Arm Cross compile
+
+See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html
+
+ sudo apt-get install ssh dkms build-essential linux-headers-generic
+ sudo apt-get install kdevelop cmake git subversion
+ sudo apt-get install graphviz doxygen doxygen-gui
+ sudo apt-get install manpages manpages-dev manpages-posix manpages-posix-dev
+ sudo apt-get install libboost-all-dev libboost-dev libssl-dev
+ sudo apt-get install rpm terminator fish
+ sudo apt-get install g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf
+
+### Build psnr tool
+
+ cd util
+ arm-linux-gnueabihf-g++ psnr_main.cc psnr.cc ssim.cc -o psnr
+ arm-linux-gnueabihf-objdump -d psnr
+
+## Running Unittests
+
+### Windows
+
+ out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
+
+### OSX
+
+ out/Release/libyuv_unittest --gtest_filter="*"
+
+### Linux
+
+ out/Release/libyuv_unittest --gtest_filter="*"
+
+Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g.
+
+ out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+
+## CPU Emulator tools
+
+### Intel SDE (Software Development Emulator)
+
+Pre-requisite: Install IntelSDE: http://software.intel.com/en-us/articles/intel-software-development-emulator
+
+Then run:
+
+ c:\intelsde\sde -hsw -- out\Release\libyuv_unittest.exe --gtest_filter=*
+
+ ~/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=**I420ToARGB_Opt
+
+## Sanitizers
+
+ gn gen out/Debug "--args=is_debug=true is_asan=true"
+ ninja -v -C out/Debug
+
+ Sanitizers available: tsan, msan, asan, ubsan, lsan
+
+### Running Dr Memory memcheck for Windows
+
+Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
+
+ drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
diff --git a/libyuv/src/main/cpp/libyuv/docs/rotation.md b/libyuv/src/main/cpp/libyuv/docs/rotation.md
new file mode 100644
index 0000000..fb84fce
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/docs/rotation.md
@@ -0,0 +1,103 @@
+# Introduction
+
+Rotation by multiplies of 90 degrees allows mobile devices to rotate webcams from landscape to portrait. The higher level functions ConvertToI420 and ConvertToARGB allow rotation of any format. Optimized functionality is supported for I420, ARGB, NV12 and NV21.
+
+# ConvertToI420
+
+ int ConvertToI420(const uint8* src_frame, size_t src_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+This function crops, converts, and rotates. You should think of it in that order.
+ * Crops the original image, which is src_width x src_height, to crop_width x crop_height. At this point the image is still not rotated.
+ * Converts the cropped region to I420. Supports inverted source for src_height negative.
+ * Rotates by 90, 180 or 270 degrees.
+The buffer the caller provides should account for rotation. Be especially important to get stride of the destination correct.
+
+e.g.
+640 x 480 NV12 captured
+Crop to 640 x 360
+Rotate by 90 degrees to 360 x 640.
+Caller passes stride of 360 for Y and 360 / 2 for U and V.
+Caller passes crop_width of 640, crop_height of 360.
+
+# ConvertToARGB
+
+ int ConvertToARGB(const uint8* src_frame, size_t src_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows, 16 bytes at a time.
+
+# I420Rotate
+
+ int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
+
+Destination is rotated, so pass dst_stride_y etc that consider rotation.
+Rotate by 180 can be done in place, but 90 and 270 can not.
+
+Implementation (Neon/SSE2) uses 8 x 8 block transpose, so best efficiency is with sizes and pointers that are aligned to 8.
+
+Cropping can be achieved by adjusting the src_y/u/v pointers and src_width, src_height.
+
+Lower level plane functions are provided, allowing other planar formats to be rotated. (e.g. I444)
+
+For other planar YUV formats (I444, I422, I411, I400, NV16, NV24), the planar functions are exposed and can be called directly
+
+
+ // Rotate a plane by 0, 90, 180, or 270.
+ int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int src_width, int src_height, enum RotationMode mode);
+
+# ARGBRotate
+
+ LIBYUV_API
+ int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height, enum RotationMode mode);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows.
+
+Rotate by 90, or any angle, can be achieved using ARGBAffine.
+
+# Mirror - Horizontal Flip
+
+Mirror functions for horizontally flipping an image, which can be useful for 'self view' of a webcam.
+
+ int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+Mirror functionality can also be achieved with the I420Scale and ARGBScale functions by passing negative width and/or height.
+
+# Invert - Vertical Flip
+
+Inverting can be achieved with almost any libyuv function by passing a negative source height.
+
+I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+
+
diff --git a/libyuv/src/main/cpp/libyuv/download_vs_toolchain.py b/libyuv/src/main/cpp/libyuv/download_vs_toolchain.py
new file mode 100644
index 0000000..4b34578
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/download_vs_toolchain.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run the vs_toolchain.py script to download the
+# Visual Studio toolchain. It's just a temporary measure while waiting for the
+# Chrome team to move find_depot_tools into src/build to get rid of these
+# workarounds (similar one in gyp_libyuv).
+
+import os
+import sys
+
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
+
+
+import vs_toolchain
+
+
+if __name__ == '__main__':
+ sys.exit(vs_toolchain.main())
diff --git a/libyuv/src/main/cpp/libyuv/gyp_libyuv b/libyuv/src/main/cpp/libyuv/gyp_libyuv
new file mode 100644
index 0000000..445b924
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/gyp_libyuv
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run GYP for libyuv. It contains selected parts of the
+# main function from the src/build/gyp_chromium file.
+
+import glob
+import os
+import shlex
+import sys
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+import gyp_chromium
+import gyp_helper
+import vs_toolchain
+
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
+import gyp
+
+def GetSupplementalFiles():
+ """Returns a list of the supplemental files that are included in all GYP
+ sources."""
+ # Can't use the one in gyp_chromium since the directory location of the root
+ # is different.
+ return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
+
+
+if __name__ == '__main__':
+ args = sys.argv[1:]
+
+ if int(os.environ.get('GYP_CHROMIUM_NO_ACTION', 0)):
+ print 'Skipping gyp_libyuv due to GYP_CHROMIUM_NO_ACTION env var.'
+ sys.exit(0)
+
+ # This could give false positives since it doesn't actually do real option
+ # parsing. Oh well.
+ gyp_file_specified = False
+ for arg in args:
+ if arg.endswith('.gyp'):
+ gyp_file_specified = True
+ break
+
+ # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
+ if not gyp_file_specified:
+ # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
+ # work, but chdir'ing and adding the relative path does. Spooky :/
+ os.chdir(checkout_root)
+ args.append('all.gyp')
+
+ # There shouldn't be a circular dependency relationship between .gyp files,
+ args.append('--no-circular-check')
+
+ # Default to ninja unless GYP_GENERATORS is set.
+ if not os.environ.get('GYP_GENERATORS'):
+ os.environ['GYP_GENERATORS'] = 'ninja'
+
+ vs2013_runtime_dll_dirs = None
+ if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
+ vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
+
+ # Enforce gyp syntax checking. This adds about 20% execution time.
+ args.append('--check')
+
+ supplemental_includes = gyp_chromium.GetSupplementalFiles()
+ gyp_vars_dict = gyp_chromium.GetGypVars(supplemental_includes)
+
+ # Automatically turn on crosscompile support for platforms that need it.
+ if all(('ninja' in os.environ.get('GYP_GENERATORS', ''),
+ gyp_vars_dict.get('OS') in ['android', 'ios'],
+ 'GYP_CROSSCOMPILE' not in os.environ)):
+ os.environ['GYP_CROSSCOMPILE'] = '1'
+
+ args.extend(['-I' + i for i in
+ gyp_chromium.additional_include_files(supplemental_includes,
+ args)])
+
+ # Set the gyp depth variable to the root of the checkout.
+ args.append('--depth=' + os.path.relpath(checkout_root))
+
+ print 'Updating projects from gyp files...'
+ sys.stdout.flush()
+
+ # Off we go...
+ gyp_rc = gyp.main(args)
+
+ if vs2013_runtime_dll_dirs:
+ x64_runtime, x86_runtime = vs2013_runtime_dll_dirs
+ vs_toolchain.CopyVsRuntimeDlls(
+ os.path.join(checkout_root, gyp_chromium.GetOutputDirectory()),
+ (x86_runtime, x64_runtime))
+
+ sys.exit(gyp_rc)
diff --git a/libyuv/src/main/cpp/libyuv/gyp_libyuv.py b/libyuv/src/main/cpp/libyuv/gyp_libyuv.py
new file mode 100644
index 0000000..bb32ec3
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/gyp_libyuv.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+
+# This script is a modified copy of the src/build/gyp_chromium.py file.
+# It is needed for parallel processing.
+
+# This file is (possibly, depending on python version) imported by
+# gyp_libyuv when GYP_PARALLEL=1 and it creates sub-processes
+# through the multiprocessing library.
+
+# Importing in Python 2.6 (fixed in 2.7) on Windows doesn't search for
+# imports that don't end in .py (and aren't directories with an
+# __init__.py). This wrapper makes "import gyp_libyuv" work with
+# those old versions and makes it possible to execute gyp_libyuv.py
+# directly on Windows where the extension is useful.
+
+import os
+
+path = os.path.abspath(os.path.split(__file__)[0])
+execfile(os.path.join(path, 'gyp_libyuv'))
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv.h b/libyuv/src/main/cpp/libyuv/include/libyuv.h
new file mode 100644
index 0000000..aeffd5e
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_H_
+#define INCLUDE_LIBYUV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/mjpeg_decoder.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
+
+#endif // INCLUDE_LIBYUV_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/basic_types.h b/libyuv/src/main/cpp/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 0000000..7d98bb9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include // for NULL, size_t
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#include // for uintptr_t on x86
+#else
+#include // for uintptr_t
+#endif
+
+#ifndef GG_LONGLONG
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x##I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x##UI64
+#endif
+#define INT64_F "I64"
+#else // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64; // NOLINT
+typedef long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x##L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x##UL
+#endif
+#define INT64_F "l"
+#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64; // NOLINT
+typedef long long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x##LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x##ULL
+#endif
+#define INT64_F "ll"
+#endif // __LP64__
+#endif // COMPILER_MSVC
+typedef unsigned int uint32;
+typedef int int32;
+typedef unsigned short uint16; // NOLINT
+typedef short int16; // NOLINT
+typedef unsigned char uint8;
+typedef signed char int8;
+#endif // INT_TYPES_DEFINED
+#endif // GG_LONGLONG
+
+// Detect compiler is for x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86)
+#define CPU_X86 1
+#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
+
+#ifndef ALIGNP
+#ifdef __cplusplus
+#define ALIGNP(p, t) \
+ reinterpret_cast( \
+ ((reinterpret_cast(p) + ((t)-1)) & ~((t)-1)))
+#else
+#define ALIGNP(p, t) \
+ (uint8*)((((uintptr_t)(p) + ((t)-1)) & ~((t)-1))) /* NOLINT */
+#endif
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
+
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/compare.h b/libyuv/src/main/cpp/libyuv/include/libyuv/compare.h
new file mode 100644
index 0000000..a06eff2
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/compare.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Hamming Distance
+LIBYUV_API
+uint64 ComputeHammingDistance(const uint8* src_a,
+ const uint8* src_b,
+ int count);
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/compare_row.h b/libyuv/src/main/cpp/libyuv/include/libyuv/compare_row.h
new file mode 100644
index 0000000..971aecf
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/compare_row.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_
+#define INCLUDE_LIBYUV_COMPARE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#endif
+
+// The following are available for Visual C and GCC:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
+#define HAS_HASHDJB2_SSE41
+#define HAS_SUMSQUAREERROR_SSE2
+#define HAS_HAMMINGDISTANCE_X86
+#endif
+
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+#endif
+
+// The following are available for Neon:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+#define HAS_HAMMINGDISTANCE_NEON
+#endif
+
+uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/convert.h b/libyuv/src/main/cpp/libyuv/include/libyuv/convert.h
new file mode 100644
index 0000000..f096d19
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/convert.h
@@ -0,0 +1,366 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+
+// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
+#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_vu,
+ int src_stride_vu,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int pixel_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height);
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame,
+ size_t src_size,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/convert_argb.h b/libyuv/src/main/cpp/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 0000000..f43a506
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,454 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Duplicate prototype for function in convert_from.h for remoting.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_vu,
+ int src_stride_vu,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame,
+ size_t src_size,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/convert_from.h b/libyuv/src/main/cpp/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 0000000..a050e44
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ const uint8* dither4x4,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y,
+ int y_stride,
+ const uint8* u,
+ int u_stride,
+ const uint8* v,
+ int v_stride,
+ uint8* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/convert_from_argb.h b/libyuv/src/main/cpp/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 0000000..50722d7
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb,
+ int dst_stride_rgb,
+ int width,
+ int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8* dither4x4,
+ int width,
+ int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_g,
+ int dst_stride_g,
+ int width,
+ int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/cpu_id.h b/libyuv/src/main/cpp/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 0000000..6d1afbd
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Internal flag to indicate cpuid requires initialization.
+static const int kCpuInitialized = 0x1;
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100; // unused at this time.
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasAVX3 = 0x2000;
+static const int kCpuHasF16C = 0x4000;
+
+// 0x8000 reserved for future X86 flags.
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasDSPR2 = 0x20000;
+static const int kCpuHasMSA = 0x40000;
+
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Detect CPU has SSE2 etc.
+// Test_flag parameter should be one of kCpuHas constants above.
+// Returns non-zero if instruction set is detected
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+ int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+ int cpu_info = cpu_info_;
+#endif
+ return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
+}
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
+// For testing, allow CPU flags to be disabled.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags);
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(int eax, int ecx, int* cpu_info);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/macros_msa.h b/libyuv/src/main/cpp/libyuv/include/libyuv/macros_msa.h
new file mode 100644
index 0000000..61be352
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/macros_msa.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
+#define INCLUDE_LIBYUV_MACROS_MSA_H_
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include
+#include
+
+#if (__mips_isa_rev >= 6)
+#define LW(psrc) \
+ ({ \
+ uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val_m; \
+ asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint64 val_m = 0; \
+ asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val0_m, val1_m; \
+ uint64 val_m = 0; \
+ val0_m = LW(psrc_ld_m); \
+ val1_m = LW(psrc_ld_m + 4); \
+ val_m = (uint64)(val1_m); /* NOLINT */ \
+ val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+ val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SW(val, pdst) \
+ ({ \
+ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val_m = (val); \
+ asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
+ : [pdst_sw_m] "=m"(*pdst_sw_m) \
+ : [val_m] "r"(val_m)); \
+ })
+
+#if (__mips == 64)
+#define SD(val, pdst) \
+ ({ \
+ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint64_t val_m = (val); \
+ asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
+ : [pdst_sd_m] "=m"(*pdst_sd_m) \
+ : [val_m] "r"(val_m)); \
+ })
+#else // !(__mips == 64)
+#define SD(val, pdst) \
+ ({ \
+ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val0_m, val1_m; \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ })
+#endif // !(__mips == 64)
+#else // !(__mips_isa_rev >= 6)
+#define LW(psrc) \
+ ({ \
+ uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val_m; \
+ asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint64 val_m = 0; \
+ asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val0_m, val1_m; \
+ uint64 val_m = 0; \
+ val0_m = LW(psrc_ld_m); \
+ val1_m = LW(psrc_ld_m + 4); \
+ val_m = (uint64)(val1_m); /* NOLINT */ \
+ val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+ val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SW(val, pdst) \
+ ({ \
+ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val_m = (val); \
+ asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
+ : [pdst_sw_m] "=m"(*pdst_sw_m) \
+ : [val_m] "r"(val_m)); \
+ })
+
+#define SD(val, pdst) \
+ ({ \
+ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val0_m, val1_m; \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ })
+#endif // (__mips_isa_rev >= 6)
+
+// TODO(fbarchard): Consider removing __VAR_ARGS versions.
+#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+
+/* Description : Load two vectors with 16 'byte' sized elements
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Load 16 byte elements in 'out0' from (psrc)
+ Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_B(RTYPE, (psrc)); \
+ out1 = LD_B(RTYPE, (psrc) + stride); \
+ }
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+
+/* Description : Store two vectors with stride each having 16 'byte' sized
+ elements
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+
+// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+ }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements from 'in0' and 'in1' are
+ interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+
+#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
+
+#endif // INCLUDE_LIBYUV_MACROS_MSA_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/mjpeg_decoder.h b/libyuv/src/main/cpp/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 0000000..8a4f282
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct Buffer {
+ const uint8* data;
+ int len;
+};
+
+struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format.
+ // Returns LIBYUV_TRUE if image looks valid and format is supported.
+ // If return value is LIBYUV_TRUE, then the values for all the following
+ // getters are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ LIBYUV_BOOL UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+ // dst_width must match exactly. dst_height must be <= to image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+ void* opaque,
+ int dst_width,
+ int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x,
+ int* subsample_y,
+ int number_of_components);
+
+ private:
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ LIBYUV_BOOL StartDecode();
+ LIBYUV_BOOL FinishDecode();
+
+ void SetScanlinePointers(uint8** data);
+ LIBYUV_BOOL DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ LIBYUV_BOOL has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8** databuf_;
+ int* databuf_strides_;
+};
+
+} // namespace libyuv
+
+#endif // __cplusplus
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/planar_functions.h b/libyuv/src/main/cpp/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 0000000..040839c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,796 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ uint32 value);
+
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Copy I400. Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToY(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror. A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int x,
+ int y,
+ int width,
+ int height,
+ int value_y,
+ int value_u,
+ int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb,
+ int dst_stride_argb,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb,
+ int dst_stride_argb,
+ int x,
+ int y,
+ int width,
+ int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb,
+ int dst_stride_argb,
+ int x,
+ int y,
+ int width,
+ int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const int8* matrix_argb,
+ int width,
+ int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb,
+ int dst_stride_argb,
+ const int8* matrix_rgb,
+ int x,
+ int y,
+ int width,
+ int height);
+
+// Apply a color table each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* table_argb,
+ int x,
+ int y,
+ int width,
+ int height);
+
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* table_argb,
+ int x,
+ int y,
+ int width,
+ int height);
+
+// Apply a luma/color table each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* luma_rgb_table,
+ int width,
+ int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4rd row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be dirived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const float* poly,
+ int width,
+ int height);
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ float scale,
+ int width,
+ int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb,
+ int dst_stride_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int x,
+ int y,
+ int width,
+ int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_u0,
+ int src_stride_u0,
+ const uint8* src_v0,
+ int src_stride_v0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* src_u1,
+ int src_stride_u1,
+ const uint8* src_v1,
+ int src_stride_v1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
+
+// Convert unattentuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert preattentuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb,
+ int src_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+// 16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height,
+ int radius);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ uint32 value);
+
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8* src0,
+ int src_stride0,
+ const uint8* src1,
+ int src_stride1,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int interpolation);
+
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y,
+ int src0_stride_y,
+ const uint8* src0_u,
+ int src0_stride_u,
+ const uint8* src0_v,
+ int src0_stride_v,
+ const uint8* src1_y,
+ int src1_stride_y,
+ const uint8* src1_u,
+ int src1_stride_u,
+ const uint8* src1_v,
+ int src1_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
+// Row function for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* shuffler,
+ int width,
+ int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/rotate.h b/libyuv/src/main/cpp/libyuv/include/libyuv/rotate.h
new file mode 100644
index 0000000..b9f7154
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotateUV90(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV180(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/rotate_argb.h b/libyuv/src/main/cpp/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 0000000..be0190c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/rotate_row.h b/libyuv/src/main/cpp/libyuv/include/libyuv/rotate_row.h
new file mode 100644
index 0000000..2c51584
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/rotate_row.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__i386__) || \
+ (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+ defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
+ defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_DSPR2
+#define HAS_TRANSPOSEUVWX8_DSPR2
+#endif // defined(__mips__)
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
+
+void TransposeWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+void TransposeWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+
+void TransposeWx8_Any_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Any_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Any_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_Any_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+
+void TransposeUVWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+void TransposeUVWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_Any_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_Any_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_Any_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/row.h b/libyuv/src/main/cpp/libyuv/include/libyuv/row.h
new file mode 100644
index 0000000..8e14dcb
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/row.h
@@ -0,0 +1,3152 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include // For malloc.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+#define align_buffer_64(var, size) \
+ uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \
+ uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+
+#define free_aligned_buffer_64(var) \
+ free(var##_mem); \
+ var = 0
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif // clang >= 3.5
+#endif // __clang__
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
+#define HAS_ARGBSETROW_X86
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTORGB24ROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_BLENDPLANEROW_SSSE3
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
+// caveat: clangcl uses row_win.cc which works.
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+ defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#endif
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(GCC_HAS_AVX2))
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
+// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
+
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+ defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
+#endif
+
+// The following are available for AVX2 Visual C and clangcl 32 bit:
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
+// The following are also available on x64 Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
+ (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
+#define HAS_ARGBSETROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_HALFFLOATROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I422ALPHATOARGBROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELYROW_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_I422TOARGBROW_DSPR2
+#define HAS_INTERPOLATEROW_DSPR2
+#define HAS_MIRRORROW_DSPR2
+#define HAS_MIRRORUVROW_DSPR2
+#define HAS_SPLITUVROW_DSPR2
+#define HAS_RGB24TOARGBROW_DSPR2
+#define HAS_RAWTOARGBROW_DSPR2
+#define HAS_RGB565TOARGBROW_DSPR2
+#define HAS_ARGB1555TOARGBROW_DSPR2
+#define HAS_ARGB4444TOARGBROW_DSPR2
+#define HAS_I444TOARGBROW_DSPR2
+#define HAS_I422TOARGB4444ROW_DSPR2
+#define HAS_I422TOARGB1555ROW_DSPR2
+#define HAS_NV12TOARGBROW_DSPR2
+#define HAS_BGRATOUVROW_DSPR2
+#define HAS_BGRATOYROW_DSPR2
+#define HAS_ABGRTOUVROW_DSPR2
+#define HAS_ARGBTOYROW_DSPR2
+#define HAS_ABGRTOYROW_DSPR2
+#define HAS_RGBATOUVROW_DSPR2
+#define HAS_RGBATOYROW_DSPR2
+#define HAS_ARGBTOUVROW_DSPR2
+#endif
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
+#define HAS_ARGB4444TOARGBROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOARGB1555ROW_MSA
+#define HAS_ARGBTOARGB4444ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBTORGB565DITHERROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXYROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_UYVYTOARGBROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_MERGEUVROW_MSA
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+#if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#endif
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
+#else
+#define SIMD_ALIGNED(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
+#endif
+
+#if defined(__aarch64__)
+// This struct is for Arm64 color conversion.
+struct YuvConstants {
+ uvec16 kUVToRB;
+ uvec16 kUVToRB2;
+ uvec16 kUVToG;
+ uvec16 kUVToG2;
+ vec16 kUVBiasBGR;
+ vec32 kYToRgb;
+};
+#elif defined(__arm__)
+// This struct is for ArmV7 color conversion.
+struct YuvConstants {
+ uvec8 kUVToRB;
+ uvec8 kUVToG;
+ vec16 kUVBiasBGR;
+ vec32 kYToRgb;
+};
+#else
+// This struct is for Intel color conversion.
+struct YuvConstants {
+ int8 kUVToB[32];
+ int8 kUVToG[32];
+ int8 kUVToR[32];
+ int16 kUVBiasB[16];
+ int16 kUVBiasG[16];
+ int16 kUVBiasR[16];
+ int16 kYToRgb[16];
+};
+
+// Offsets into YuvConstants structure
+#define KUVTOB 0
+#define KUVTOG 32
+#define KUVTOR 64
+#define KUVBIASB 96
+#define KUVBIASG 128
+#define KUVBIASR 160
+#define KYTORGB 192
+#endif
+
+// Conversion matrix for YUV to RGB
+extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
+extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
+extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
+
+// Conversion matrix for YVU to BGR
+extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
+extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
+extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCL macros for GCC x86 and x64.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN
+#endif
+#if defined(__native_client__) && defined(__x86_64__)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " (%%r15,%%r14),%%" #reg "\n" BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " %%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " (%%r15,%%r14),%" #arg "\n" BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #op \
+ " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK
+#else // defined(__native_client__) && defined(__x86_64__)
+#define NACL_R14
+#define BUNDLEALIGN
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ #opcode " %%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 \
+ ",%%" #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
+#endif // defined(__native_client__) && defined(__x86_64__)
+
+// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be
+// measured and then run with iaca -64 libyuv_unittest.
+// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within
+// inline assembly blocks.
+// example of iaca:
+// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define IACA_ASM_START \
+ ".byte 0x0F, 0x0B\n" \
+ " movl $111, %%ebx\n" \
+ ".byte 0x64, 0x67, 0x90\n"
+
+#define IACA_ASM_END \
+ " movl $222, %%ebx\n" \
+ ".byte 0x64, 0x67, 0x90\n" \
+ ".byte 0x0F, 0x0B\n"
+
+#define IACA_SSC_MARK(MARK_ID) \
+ __asm__ __volatile__("\n\t movl $" #MARK_ID \
+ ", %%ebx" \
+ "\n\t .byte 0x64, 0x67, 0x90" \
+ : \
+ : \
+ : "memory");
+
+#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
+
+#else /* Visual C */
+#define IACA_UD_BYTES \
+ { __asm _emit 0x0F __asm _emit 0x0B }
+
+#define IACA_SSC_MARK(x) \
+ { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
+#endif
+
+#define IACA_START \
+ { \
+ IACA_UD_BYTES \
+ IACA_SSC_MARK(111) \
+ }
+#define IACA_END \
+ { \
+ IACA_SSC_MARK(222) \
+ IACA_UD_BYTES \
+ }
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void I422ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MSA(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MSA(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_MSA(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_MSA(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_MSA(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_MSA(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_MSA(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_MSA(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
+void BGRAToYRow_MSA(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_MSA(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_MSA(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_MSA(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width);
+void BGRAToUVRow_DSPR2(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToYRow_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToUVRow_DSPR2(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToYRow_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
+void ABGRToYRow_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToUVRow_DSPR2(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToYRow_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
+void ARGBToUVRow_DSPR2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555,
+ uint8* dst_y,
+ int width);
+void BGRAToYRow_Any_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
+void ARGBToYRow_Any_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
+void ABGRToYRow_Any_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444,
+ uint8* dst_y,
+ int width);
+void BGRAToYRow_Any_MSA(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_MSA(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_MSA(const uint8* src_rgba, uint8* dst_y, int width);
+void ARGBToYJRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void RGB24ToYRow_Any_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_MSA(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_Any_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8* src_argb1555, uint8* dst_y, int width);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_Any_MSA(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_Any_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_MSA(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_MSA(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_MSA(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MSA(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_Any_MSA(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MSA(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_MSA(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_DSPR2(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_DSPR2(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_DSPR2(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_DSPR2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_C(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_C(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_C(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_C(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void MirrorUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void MirrorUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+void SplitUVRow_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_Any_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_Any_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_Any_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void SplitUVRow_Any_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+
+void MergeUVRow_C(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_MSA(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_MSA(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+
+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_a,
+ int width);
+void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_a,
+ int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_a,
+ int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_MSA(uint8* dst_argb, uint32 v32, int count);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
+void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
+void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24,
+ uint8* dst_argb,
+ int width);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24,
+ uint8* dst_argb,
+ int width);
+void RGB24ToARGBRow_Any_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToARGBRow_Any_MSA(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RAWToRGB24Row_Any_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void RGB565ToARGBRow_Any_MSA(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_MSA(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void RGB24ToARGBRow_Any_DSPR2(const uint8* src_rgb24,
+ uint8* dst_argb,
+ int width);
+void RAWToARGBRow_Any_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
+void RGB565ToARGBRow_Any_DSPR2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_DSPR2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_DSPR2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+
+void ARGB4444ToARGBRow_Any_MSA(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void ARGBToRGB565DitherRow_C(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
+
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBBlendRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+
+// Unattenuated planar alpha blend.
+void BlendPlaneRow_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_Any_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_Any_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_C(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB24Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_Any_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I411ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_C(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_MSA(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_C(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_MSA(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+
+void ARGBShadeRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value);
+void ARGBShadeRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
+ int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
+ int count);
+void ComputeCumulativeSumRow_C(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_DSPR2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_MSA(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_DSPR2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_MSA(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width);
+void SobelXRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width);
+void SobelXRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width);
+void SobelYRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width);
+void SobelYRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width);
+void SobelYRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width);
+void SobelRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelXYRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_Any_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_Any_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_Any_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width);
+
+// Scale and convert to half float.
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_SSE2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_AVX2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_F16C(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_F16C(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_NEON(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_NEON(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/scale.h b/libyuv/src/main/cpp/libyuv/include/libyuv/scale.h
new file mode 100644
index 0000000..6d6b9a8
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/scale.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce ever better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y,
+ int src_stride_y,
+ const uint16* src_u,
+ int src_stride_u,
+ const uint16* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16* dst_y,
+ int dst_stride_y,
+ uint16* dst_u,
+ int dst_stride_u,
+ uint16* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ uint8* dst_u,
+ uint8* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ LIBYUV_BOOL interpolate);
+
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420,
+ int src_width,
+ int src_height,
+ uint8* dst_i420,
+ int dst_width,
+ int dst_height,
+ int dst_yoffset,
+ LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif // __cplusplus
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/scale_argb.h b/libyuv/src/main/cpp/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 0000000..3d25e57
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering);
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint32 src_fourcc,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ uint32 dst_fourcc,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/scale_row.h b/libyuv/src/main/cpp/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 0000000..edb46cc
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,929 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSSE3
+#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
+ defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_DSPR2
+#define HAS_SCALEROWDOWN4_DSPR2
+#define HAS_SCALEROWDOWN34_DSPR2
+#define HAS_SCALEROWDOWN38_DSPR2
+#define HAS_SCALEADDROW_DSPR2
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEADDROW_MSA
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int bpp,
+ enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_argb,
+ uint16* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering,
+ int* x,
+ int* y,
+ int* dx,
+ int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width);
+void ScaleCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int,
+ int);
+void ScaleColsUp2_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int,
+ int);
+void ScaleFilterCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown38_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int dst_width);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int,
+ int);
+void ScaleARGBFilterCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+
+// Specialized scalers for x86.
+void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown34_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown38_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_DSPR2(const uint8* src_ptr,
+ uint16* dst_ptr,
+ int src_width);
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ROW_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/version.h b/libyuv/src/main/cpp/libyuv/include/libyuv/version.h
new file mode 100644
index 0000000..7a624d9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1662
+
+#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/libyuv/src/main/cpp/libyuv/include/libyuv/video_common.h b/libyuv/src/main/cpp/libyuv/include/libyuv/video_common.h
new file mode 100644
index 0000000..f3711c4
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) \
+ ((static_cast(a)) | (static_cast(b) << 8) | \
+ (static_cast(c) << 16) | (static_cast(d) << 24))
+#else
+#define FOURCC(a, b, c, d) \
+ (((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+ ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxilliary formats call primary converters.
+enum FourCC {
+ // 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'), // deprecated.
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+ // 1 Secondary YUV format: row biplanar.
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
+
+ // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
+
+ // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // 1 Primary Compressed YUV format.
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+ // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
+
+ // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+ FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB
+ FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB
+ FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO.
+ FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
+ FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
+
+ // 1 Auxiliary compressed YUV format set aside for capturer.
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+ // Match any fourcc.
+ FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+ // Canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_J420 = 12,
+ FOURCC_BPP_J400 = 8,
+ FOURCC_BPP_H420 = 12,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1,
+ FOURCC_BPP_DMB1 = 1,
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+ FOURCC_BPP_CM32 = 32,
+ FOURCC_BPP_CM24 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_
diff --git a/libyuv/src/main/cpp/libyuv/infra/config/OWNERS b/libyuv/src/main/cpp/libyuv/infra/config/OWNERS
new file mode 100644
index 0000000..02eccd5
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/infra/config/OWNERS
@@ -0,0 +1,3 @@
+set noparent
+agable@chromium.org
+kjellander@chromium.org
diff --git a/libyuv/src/main/cpp/libyuv/infra/config/README.md b/libyuv/src/main/cpp/libyuv/infra/config/README.md
new file mode 100644
index 0000000..c036d61
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/infra/config/README.md
@@ -0,0 +1 @@
+This directory contains configuration files for infra services.
diff --git a/libyuv/src/main/cpp/libyuv/infra/config/cq.cfg b/libyuv/src/main/cpp/libyuv/infra/config/cq.cfg
new file mode 100644
index 0000000..c719010
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/infra/config/cq.cfg
@@ -0,0 +1,65 @@
+# Commit Queue configuration file. The documentation of the format can be found
+# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg.
+
+version: 1
+cq_name: "libyuv"
+cq_status_url: "https://chromium-cq-status.appspot.com"
+git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git"
+
+gerrit {}
+rietveld {
+ url: "https://codereview.chromium.org"
+}
+
+
+verifiers {
+ reviewer_lgtm {
+ committer_list: "project-libyuv-committers"
+ dry_run_access_list: "project-libyuv-tryjob-access"
+ }
+ gerrit_cq_ability {
+ committer_list: "project-libyuv-committers"
+ dry_run_access_list: "project-libyuv-tryjob-access"
+ }
+
+ try_job {
+ buckets {
+ name: "master.tryserver.libyuv"
+ builders { name: "win" }
+ builders { name: "win_rel" }
+ builders { name: "win_x64_rel" }
+ builders { name: "win_clang" }
+ builders { name: "win_clang_rel" }
+ builders { name: "win_x64_clang_rel" }
+ builders { name: "mac" }
+ builders { name: "mac_rel" }
+ builders { name: "mac_asan" }
+ builders { name: "ios" }
+ builders { name: "ios_rel" }
+ builders { name: "ios_arm64" }
+ builders { name: "ios_arm64_rel" }
+ builders { name: "linux" }
+ builders { name: "linux_rel" }
+ builders {
+ name: "linux_gcc"
+ experiment_percentage: 100
+ }
+ builders { name: "linux_memcheck" }
+ builders { name: "linux_tsan2" }
+ builders { name: "linux_asan" }
+ builders { name: "linux_msan" }
+ builders { name: "linux_ubsan" }
+ builders { name: "linux_ubsan_vptr" }
+ builders { name: "android" }
+ builders { name: "android_rel" }
+ builders { name: "android_clang" }
+ builders { name: "android_arm64" }
+ builders { name: "android_x86" }
+ builders { name: "android_x64" }
+ builders {
+ name: "android_mips"
+ experiment_percentage: 100
+ }
+ }
+ }
+}
diff --git a/libyuv/src/main/cpp/libyuv/libyuv.gni b/libyuv/src/main/cpp/libyuv/libyuv.gni
new file mode 100644
index 0000000..89e4d38
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/libyuv.gni
@@ -0,0 +1,20 @@
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import("//build_overrides/build.gni")
+import("//build/config/arm.gni")
+import("//build/config/mips.gni")
+
+declare_args() {
+ libyuv_include_tests = !build_with_chromium
+ libyuv_disable_jpeg = false
+ libyuv_use_neon = (current_cpu == "arm64" ||
+ (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)))
+ libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") &&
+ mips_use_msa
+}
diff --git a/libyuv/src/main/cpp/libyuv/libyuv.gyp b/libyuv/src/main/cpp/libyuv/libyuv.gyp
new file mode 100644
index 0000000..f73a1a4
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/libyuv.gyp
@@ -0,0 +1,162 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'includes': [
+ 'libyuv.gypi',
+ ],
+ # Make sure that if we are being compiled to an xcodeproj, nothing tries to
+ # include a .pch.
+ 'xcode_settings': {
+ 'GCC_PREFIX_HEADER': '',
+ 'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
+ },
+ 'variables': {
+ 'use_system_libjpeg%': 0,
+ # Can be enabled if your jpeg has GYP support.
+ 'libyuv_disable_jpeg%': 1,
+ # 'chromium_code' treats libyuv as internal and increases warning level.
+ 'chromium_code': 1,
+ # clang compiler default variable usable by other apps that include libyuv.
+ 'clang%': 0,
+ # Link-Time Optimizations.
+ 'use_lto%': 0,
+ 'mips_msa%': 0, # Default to msa off.
+ 'build_neon': 0,
+ 'build_msa': 0,
+ 'conditions': [
+ ['(target_arch == "armv7" or target_arch == "armv7s" or \
+ (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+ and (arm_neon == 1 or arm_neon_optional == 1)', {
+ 'build_neon': 1,
+ }],
+ ['(target_arch == "mipsel" or target_arch == "mips64el")\
+ and (mips_msa == 1)',
+ {
+ 'build_msa': 1,
+ }],
+ ],
+ },
+
+ 'targets': [
+ {
+ 'target_name': 'libyuv',
+ # Change type to 'shared_library' to build .so or .dll files.
+ 'type': 'static_library',
+ 'variables': {
+ 'optimize': 'max', # enable O2 and ltcg.
+ },
+ # Allows libyuv.a redistributable library without external dependencies.
+ 'standalone_static_library': 1,
+ 'conditions': [
+ # Disable -Wunused-parameter
+ ['clang == 1', {
+ 'cflags': [
+ '-Wno-unused-parameter',
+ ],
+ }],
+ ['build_neon != 0', {
+ 'defines': [
+ 'LIBYUV_NEON',
+ ],
+ 'cflags!': [
+ '-mfpu=vfp',
+ '-mfpu=vfpv3',
+ '-mfpu=vfpv3-d16',
+ # '-mthumb', # arm32 not thumb
+ ],
+ 'conditions': [
+ # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
+ ['clang == 0 and use_lto == 1', {
+ 'cflags!': [
+ '-flto',
+ '-ffat-lto-objects',
+ ],
+ }],
+ # arm64 does not need -mfpu=neon option as neon is not optional
+ ['target_arch != "arm64"', {
+ 'cflags': [
+ '-mfpu=neon',
+ # '-marm', # arm32 not thumb
+ ],
+ }],
+ ],
+ }],
+ ['build_msa != 0', {
+ 'defines': [
+ 'LIBYUV_MSA',
+ ],
+ }],
+ ['OS != "ios" and libyuv_disable_jpeg != 1', {
+ 'defines': [
+ 'HAVE_JPEG'
+ ],
+ 'conditions': [
+ # Caveat system jpeg support may not support motion jpeg
+ [ 'use_system_libjpeg == 1', {
+ 'dependencies': [
+ '<(DEPTH)/third_party/libjpeg/libjpeg.gyp:libjpeg',
+ ],
+ }, {
+ 'dependencies': [
+ '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+ ],
+ }],
+ [ 'use_system_libjpeg == 1', {
+ 'link_settings': {
+ 'libraries': [
+ '-ljpeg',
+ ],
+ }
+ }],
+ ],
+ }],
+ ], #conditions
+ 'defines': [
+ # Enable the following 3 macros to turn off assembly for specified CPU.
+ # 'LIBYUV_DISABLE_X86',
+ # 'LIBYUV_DISABLE_NEON',
+ # 'LIBYUV_DISABLE_DSPR2',
+ # Enable the following macro to build libyuv as a shared library (dll).
+ # 'LIBYUV_USING_SHARED_LIBRARY',
+ # TODO(fbarchard): Make these into gyp defines.
+ ],
+ 'include_dirs': [
+ 'include',
+ '.',
+ ],
+ 'direct_dependent_settings': {
+ 'include_dirs': [
+ 'include',
+ '.',
+ ],
+ 'conditions': [
+ ['OS == "android" and target_arch == "arm64"', {
+ 'ldflags': [
+ '-Wl,--dynamic-linker,/system/bin/linker64',
+ ],
+ }],
+ ['OS == "android" and target_arch != "arm64"', {
+ 'ldflags': [
+ '-Wl,--dynamic-linker,/system/bin/linker',
+ ],
+ }],
+ ], #conditions
+ },
+ 'sources': [
+ '<@(libyuv_sources)',
+ ],
+ },
+ ], # targets.
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/libyuv/src/main/cpp/libyuv/libyuv.gypi b/libyuv/src/main/cpp/libyuv/libyuv.gypi
new file mode 100644
index 0000000..18b2fec
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/libyuv.gypi
@@ -0,0 +1,83 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'variables': {
+ 'libyuv_sources': [
+ # includes.
+ 'include/libyuv.h',
+ 'include/libyuv/basic_types.h',
+ 'include/libyuv/compare.h',
+ 'include/libyuv/convert.h',
+ 'include/libyuv/convert_argb.h',
+ 'include/libyuv/convert_from.h',
+ 'include/libyuv/convert_from_argb.h',
+ 'include/libyuv/cpu_id.h',
+ 'include/libyuv/macros_msa.h',
+ 'include/libyuv/mjpeg_decoder.h',
+ 'include/libyuv/planar_functions.h',
+ 'include/libyuv/rotate.h',
+ 'include/libyuv/rotate_argb.h',
+ 'include/libyuv/rotate_row.h',
+ 'include/libyuv/row.h',
+ 'include/libyuv/scale.h',
+ 'include/libyuv/scale_argb.h',
+ 'include/libyuv/scale_row.h',
+ 'include/libyuv/version.h',
+ 'include/libyuv/video_common.h',
+
+ # sources.
+ 'source/compare.cc',
+ 'source/compare_common.cc',
+ 'source/compare_gcc.cc',
+ 'source/compare_neon.cc',
+ 'source/compare_neon64.cc',
+ 'source/compare_win.cc',
+ 'source/convert.cc',
+ 'source/convert_argb.cc',
+ 'source/convert_from.cc',
+ 'source/convert_from_argb.cc',
+ 'source/convert_jpeg.cc',
+ 'source/convert_to_argb.cc',
+ 'source/convert_to_i420.cc',
+ 'source/cpu_id.cc',
+ 'source/mjpeg_decoder.cc',
+ 'source/mjpeg_validate.cc',
+ 'source/planar_functions.cc',
+ 'source/rotate.cc',
+ 'source/rotate_any.cc',
+ 'source/rotate_argb.cc',
+ 'source/rotate_common.cc',
+ 'source/rotate_gcc.cc',
+ 'source/rotate_dspr2.cc',
+ 'source/rotate_msa.cc',
+ 'source/rotate_neon.cc',
+ 'source/rotate_neon64.cc',
+ 'source/rotate_win.cc',
+ 'source/row_any.cc',
+ 'source/row_common.cc',
+ 'source/row_gcc.cc',
+ 'source/row_dspr2.cc',
+ 'source/row_msa.cc',
+ 'source/row_neon.cc',
+ 'source/row_neon64.cc',
+ 'source/row_win.cc',
+ 'source/scale.cc',
+ 'source/scale_any.cc',
+ 'source/scale_argb.cc',
+ 'source/scale_common.cc',
+ 'source/scale_gcc.cc',
+ 'source/scale_dspr2.cc',
+ 'source/scale_msa.cc',
+ 'source/scale_neon.cc',
+ 'source/scale_neon64.cc',
+ 'source/scale_win.cc',
+ 'source/video_common.cc',
+ ],
+ }
+}
diff --git a/libyuv/src/main/cpp/libyuv/libyuv_nacl.gyp b/libyuv/src/main/cpp/libyuv/libyuv_nacl.gyp
new file mode 100644
index 0000000..b8fe57e
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/libyuv_nacl.gyp
@@ -0,0 +1,37 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'includes': [
+ 'libyuv.gypi',
+ '../../native_client/build/untrusted.gypi',
+ ],
+ 'targets': [
+ {
+ 'target_name': 'libyuv_nacl',
+ 'type': 'none',
+ 'variables': {
+ 'nlib_target': 'libyuv_nacl.a',
+ 'build_glibc': 0,
+ 'build_newlib': 0,
+ 'build_pnacl_newlib': 1,
+ },
+ 'include_dirs': [
+ 'include',
+ ],
+ 'direct_dependent_settings': {
+ 'include_dirs': [
+ 'include',
+ ],
+ },
+ 'sources': [
+ '<@(libyuv_sources)',
+ ],
+ }, # target libyuv_nacl
+ ]
+}
diff --git a/libyuv/src/main/cpp/libyuv/libyuv_test.gyp b/libyuv/src/main/cpp/libyuv/libyuv_test.gyp
new file mode 100644
index 0000000..4222cf2
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/libyuv_test.gyp
@@ -0,0 +1,204 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'variables': {
+ # Can be enabled if your jpeg has GYP support.
+ 'libyuv_disable_jpeg%': 1,
+ 'mips_msa%': 0, # Default to msa off.
+ },
+ 'targets': [
+ {
+ 'target_name': 'libyuv_unittest',
+ 'type': '<(gtest_target_type)',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ 'testing/gtest.gyp:gtest',
+ 'third_party/gflags/gflags.gyp:gflags',
+ ],
+ 'direct_dependent_settings': {
+ 'defines': [
+ 'GTEST_RELATIVE_PATH',
+ ],
+ },
+ 'export_dependent_settings': [
+ '<(DEPTH)/testing/gtest.gyp:gtest',
+ ],
+ 'sources': [
+ # headers
+ 'unit_test/unit_test.h',
+
+ # sources
+ 'unit_test/basictypes_test.cc',
+ 'unit_test/compare_test.cc',
+ 'unit_test/color_test.cc',
+ 'unit_test/convert_test.cc',
+ 'unit_test/cpu_test.cc',
+ 'unit_test/cpu_thread_test.cc',
+ 'unit_test/math_test.cc',
+ 'unit_test/planar_test.cc',
+ 'unit_test/rotate_argb_test.cc',
+ 'unit_test/rotate_test.cc',
+ 'unit_test/scale_argb_test.cc',
+ 'unit_test/scale_test.cc',
+ 'unit_test/unit_test.cc',
+ 'unit_test/video_common_test.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ [ 'OS == "ios"', {
+ 'xcode_settings': {
+ 'DEBUGGING_SYMBOLS': 'YES',
+ 'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
+ # Work around compile issue with isosim.mm, see
+ # https://code.google.com/p/libyuv/issues/detail?id=548 for details.
+ 'WARNING_CFLAGS': [
+ '-Wno-sometimes-uninitialized',
+ ],
+ },
+ 'cflags': [
+ '-Wno-sometimes-uninitialized',
+ ],
+ }],
+ [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+ 'defines': [
+ 'HAVE_JPEG',
+ ],
+ }],
+ ['OS=="android"', {
+ 'dependencies': [
+ '<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
+ ],
+ }],
+ # TODO(YangZhang): These lines can be removed when high accuracy
+ # YUV to RGB to Neon is ported.
+ [ '(target_arch == "armv7" or target_arch == "armv7s" \
+ or (target_arch == "arm" and arm_version >= 7) \
+ or target_arch == "arm64") \
+ and (arm_neon == 1 or arm_neon_optional == 1)', {
+ 'defines': [
+ 'LIBYUV_NEON'
+ ],
+ }],
+ [ '(target_arch == "mipsel" or target_arch == "mips64el") \
+ and (mips_msa == 1)', {
+ 'defines': [
+ 'LIBYUV_MSA'
+ ],
+ }],
+ ], # conditions
+ 'defines': [
+ # Enable the following 3 macros to turn off assembly for specified CPU.
+ # 'LIBYUV_DISABLE_X86',
+ # 'LIBYUV_DISABLE_NEON',
+ # 'LIBYUV_DISABLE_DSPR2',
+ # Enable the following macro to build libyuv as a shared library (dll).
+ # 'LIBYUV_USING_SHARED_LIBRARY',
+ ],
+ },
+ {
+ 'target_name': 'compare',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'sources': [
+ # sources
+ 'util/compare.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+ {
+ 'target_name': 'yuvconvert',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'sources': [
+ # sources
+ 'util/yuvconvert.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+ # TODO(fbarchard): Enable SSE2 and OpenMP for better performance.
+ {
+ 'target_name': 'psnr',
+ 'type': 'executable',
+ 'sources': [
+ # sources
+ 'util/psnr_main.cc',
+ 'util/psnr.cc',
+ 'util/ssim.cc',
+ ],
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'conditions': [
+ [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+ 'defines': [
+ 'HAVE_JPEG',
+ ],
+ }],
+ ], # conditions
+ },
+
+ {
+ 'target_name': 'cpuid',
+ 'type': 'executable',
+ 'sources': [
+ # sources
+ 'util/cpuid.c',
+ ],
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ },
+ ], # targets
+ 'conditions': [
+ ['OS=="android"', {
+ 'targets': [
+ {
+ 'target_name': 'yuv_unittest_apk',
+ 'type': 'none',
+ 'variables': {
+ 'test_suite_name': 'yuv_unittest',
+ 'input_shlib_path': '<(SHARED_LIB_DIR)/(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
+ },
+ 'includes': [
+ 'build/apk_test.gypi',
+ ],
+ 'dependencies': [
+ 'libyuv_unittest',
+ ],
+ },
+ ],
+ }],
+ ],
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/libyuv/src/main/cpp/libyuv/linux.mk b/libyuv/src/main/cpp/libyuv/linux.mk
new file mode 100644
index 0000000..1dd527c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/linux.mk
@@ -0,0 +1,83 @@
+# This is a generic makefile for libyuv for gcc.
+# make -f linux.mk CXX=clang++
+
+CC?=gcc
+CFLAGS?=-O2 -fomit-frame-pointer
+CFLAGS+=-Iinclude/
+
+CXX?=g++
+CXXFLAGS?=-O2 -fomit-frame-pointer
+CXXFLAGS+=-Iinclude/
+
+LOCAL_OBJ_FILES := \
+ source/compare.o \
+ source/compare_common.o \
+ source/compare_gcc.o \
+ source/compare_neon64.o \
+ source/compare_neon.o \
+ source/compare_win.o \
+ source/convert_argb.o \
+ source/convert.o \
+ source/convert_from_argb.o \
+ source/convert_from.o \
+ source/convert_jpeg.o \
+ source/convert_to_argb.o \
+ source/convert_to_i420.o \
+ source/cpu_id.o \
+ source/mjpeg_decoder.o \
+ source/mjpeg_validate.o \
+ source/planar_functions.o \
+ source/rotate_any.o \
+ source/rotate_argb.o \
+ source/rotate.o \
+ source/rotate_common.o \
+ source/rotate_gcc.o \
+ source/rotate_dspr2.o \
+ source/rotate_neon64.o \
+ source/rotate_neon.o \
+ source/rotate_win.o \
+ source/row_any.o \
+ source/row_common.o \
+ source/row_gcc.o \
+ source/row_dspr2.o \
+ source/row_neon64.o \
+ source/row_neon.o \
+ source/row_win.o \
+ source/scale_any.o \
+ source/scale_argb.o \
+ source/scale.o \
+ source/scale_common.o \
+ source/scale_gcc.o \
+ source/scale_dspr2.o \
+ source/scale_neon64.o \
+ source/scale_neon.o \
+ source/scale_win.o \
+ source/video_common.o
+
+.cc.o:
+ $(CXX) -c $(CXXFLAGS) $*.cc -o $*.o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $*.c -o $*.o
+
+all: libyuv.a yuvconvert cpuid psnr
+
+libyuv.a: $(LOCAL_OBJ_FILES)
+ $(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
+
+# A C++ test utility that uses libyuv conversion.
+yuvconvert: util/yuvconvert.cc libyuv.a
+ $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a
+
+# A standalone test utility
+psnr: util/psnr.cc
+ $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+
+# A C test utility that uses libyuv conversion from C.
+# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
+# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
+cpuid: util/cpuid.c libyuv.a
+ $(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
+
+clean:
+ /bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr
diff --git a/libyuv/src/main/cpp/libyuv/public.mk b/libyuv/src/main/cpp/libyuv/public.mk
new file mode 100644
index 0000000..1342307
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/public.mk
@@ -0,0 +1,13 @@
+# This file contains all the common make variables which are useful for
+# anyone depending on this library.
+# Note that dependencies on NDK are not directly listed since NDK auto adds
+# them.
+
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
+
+LIBYUV_C_FLAGS :=
+
+LIBYUV_CPP_FLAGS :=
+
+LIBYUV_LDLIBS :=
+LIBYUV_DEP_MODULES :=
diff --git a/libyuv/src/main/cpp/libyuv/pylintrc b/libyuv/src/main/cpp/libyuv/pylintrc
new file mode 100644
index 0000000..b8bea33
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/pylintrc
@@ -0,0 +1,17 @@
+[MESSAGES CONTROL]
+
+# Disable the message, report, category or checker with the given id(s).
+# TODO(kjellander): Reduce this list to as small as possible.
+disable=I0010,I0011,bad-continuation,broad-except,duplicate-code,eval-used,exec-used,fixme,invalid-name,missing-docstring,no-init,no-member,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-branches,too-many-function-args,too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods,too-many-return-statements,too-many-statements
+
+
+[REPORTS]
+
+# Don't write out full reports, just messages.
+reports=no
+
+
+[FORMAT]
+
+# We use two spaces for indents, instead of the usual four spaces or tab.
+indent-string=' '
diff --git a/libyuv/src/main/cpp/libyuv/source/compare.cc b/libyuv/src/main/cpp/libyuv/source/compare.cc
new file mode 100644
index 0000000..3f7f147
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/compare.cc
@@ -0,0 +1,403 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include
+#include
+#ifdef _OPENMP
+#include
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ const int kBlockSize = 1 << 15; // 32768;
+ int remainder;
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ HashDjb2_SSE = HashDjb2_SSE41;
+ }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HashDjb2_SSE = HashDjb2_AVX2;
+ }
+#endif
+
+ while (count >= (uint64)(kBlockSize)) {
+ seed = HashDjb2_SSE(src, kBlockSize, seed);
+ src += kBlockSize;
+ count -= kBlockSize;
+ }
+ remainder = (int)count & ~15;
+ if (remainder) {
+ seed = HashDjb2_SSE(src, remainder, seed);
+ src += remainder;
+ count -= remainder;
+ }
+ remainder = (int)count & 15;
+ if (remainder) {
+ seed = HashDjb2_C(src, remainder, seed);
+ }
+ return seed;
+}
+
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
+ return FOURCC_BGRA;
+ }
+ if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ return FOURCC_ARGB;
+ }
+ argb += 8;
+ }
+ if (width & 1) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ }
+ return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+ uint32 fourcc = 0;
+ int h;
+
+ // Coalesce rows.
+ if (stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ stride_argb = 0;
+ }
+ for (h = 0; h < height && fourcc == 0; ++h) {
+ fourcc = ARGBDetectRow_C(argb, width);
+ argb += stride_argb;
+ }
+ return fourcc;
+}
+
+LIBYUV_API
+uint64 ComputeHammingDistance(const uint8* src_a,
+ const uint8* src_b,
+ int count) {
+ const int kBlockSize = 65536;
+ int remainder = count & (kBlockSize - 1) & ~31;
+ uint64 diff = 0;
+ int i;
+ uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
+ HammingDistance_C;
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ HammingDistance = HammingDistance_NEON;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ HammingDistance = HammingDistance_X86;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HammingDistance = HammingDistance_AVX2;
+ }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : diff)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ diff += HammingDistance(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 31;
+ if (remainder) {
+ diff += HammingDistance_C(src_a, src_b, remainder);
+ }
+ return diff;
+}
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b,
+ int count) {
+ // SumSquareError returns values 0 to 65535 for each squared difference.
+ // Up to 65536 of those can be summed and remain within a uint32.
+ // After each block of 65536 pixels, accumulate into a uint64.
+ const int kBlockSize = 65536;
+ int remainder = count & (kBlockSize - 1) & ~31;
+ uint64 sse = 0;
+ int i;
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ // Note only used for multiples of 16 so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ // Note only used for multiples of 32 so count is not checked.
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sse)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 31;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height) {
+ uint64 sse = 0;
+ int h;
+ // Coalesce rows.
+ if (stride_a == width && stride_b == width) {
+ width *= height;
+ height = 1;
+ stride_a = stride_b = 0;
+ }
+ for (h = 0; h < height; ++h) {
+ sse += ComputeSumSquareError(src_a, src_b, width);
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+ return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+ double psnr;
+ if (sse > 0) {
+ double mse = (double)count / (double)sse;
+ psnr = 10.0 * log10(255.0 * 255.0 * mse);
+ } else {
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
+ }
+
+ if (psnr > kMaxPsnr)
+ psnr = kMaxPsnr;
+
+ return psnr;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height) {
+ const uint64 samples = width * height;
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+ stride_b, width, height);
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
+ stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const uint64 sse_u = ComputeSumSquareErrorPlane(
+ src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(
+ src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
+ const uint64 samples = width * height + 2 * (width_uv * height_uv);
+ const uint64 sse = sse_y + sse_u + sse_v;
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634; // (64^2*(.01*255)^2
+static const int64 cc2 = 239708; // (64^2*(.03*255)^2
+
+static double Ssim8x8_C(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b) {
+ int64 sum_a = 0;
+ int64 sum_b = 0;
+ int64 sum_sq_a = 0;
+ int64 sum_sq_b = 0;
+ int64 sum_axb = 0;
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ {
+ const int64 count = 64;
+ // scale the constants by number of pixels
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
+
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64 sum_a_sq = sum_a * sum_a;
+ const int64 sum_b_sq = sum_b * sum_b;
+
+ const int64 ssim_d =
+ (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0) {
+ return DBL_MAX;
+ }
+ return ssim_n * 1.0 / ssim_d;
+ }
+}
+
+// We are using a 8x8 moving window with starting location of each 8x8 window
+// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
+// block boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height) {
+ int samples = 0;
+ double ssim_total = 0;
+ double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b,
+ int stride_b) = Ssim8x8_C;
+
+ // sample point start with each 4x4 location
+ int i;
+ for (i = 0; i < height - 8; i += 4) {
+ int j;
+ for (j = 0; j < width - 8; j += 4) {
+ ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+ samples++;
+ }
+
+ src_a += stride_a * 4;
+ src_b += stride_b * 4;
+ }
+
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height) {
+ const double ssim_y =
+ CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
+ width_uv, height_uv);
+ return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/compare_common.cc b/libyuv/src/main/cpp/libyuv/source/compare_common.cc
new file mode 100644
index 0000000..d3e46fb
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/compare_common.cc
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if ORIGINAL_OPT
+uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 diff = 0u;
+
+ int i;
+ for (i = 0; i < count; ++i) {
+ int x = src_a[i] ^ src_b[i];
+ if (x & 1)
+ ++diff;
+ if (x & 2)
+ ++diff;
+ if (x & 4)
+ ++diff;
+ if (x & 8)
+ ++diff;
+ if (x & 16)
+ ++diff;
+ if (x & 32)
+ ++diff;
+ if (x & 64)
+ ++diff;
+ if (x & 128)
+ ++diff;
+ }
+ return diff;
+}
+#endif
+
+// Hakmem method for hamming distance.
+uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 diff = 0u;
+
+ int i;
+ for (i = 0; i < count - 3; i += 4) {
+ uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
+ uint32 u = x - ((x >> 1) & 0x55555555);
+ u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
+ diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
+ src_a += 4;
+ src_b += 4;
+ }
+ return diff;
+}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ int i;
+ for (i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += (uint32)(diff * diff);
+ }
+ return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ int i;
+ for (i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/compare_gcc.cc b/libyuv/src/main/cpp/libyuv/source/compare_gcc.cc
new file mode 100644
index 0000000..994fb10
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/compare_gcc.cc
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 diff = 0u;
+
+ int i;
+ for (i = 0; i < count - 7; i += 8) {
+ uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b);
+ src_a += 8;
+ src_b += 8;
+ diff += __builtin_popcountll(x);
+ }
+ return diff;
+}
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10, 1) ",%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+ return sse;
+}
+
+static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile (
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/compare_neon.cc b/libyuv/src/main/cpp/libyuv/source/compare_neon.cc
new file mode 100644
index 0000000..0f374a7
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/compare_neon.cc
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 diff;
+
+ asm volatile (
+ "vmov.u16 q4, #0 \n" // accumulator
+
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
+
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
+
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(diff)
+ :
+ : "cc", "q0", "q1", "q2", "q3", "q4");
+ return diff;
+}
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/compare_neon64.cc b/libyuv/src/main/cpp/libyuv/source/compare_neon64.cc
new file mode 100644
index 0000000..da404cc
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/compare_neon64.cc
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 diff;
+ asm volatile (
+ "movi v4.8h, #0 \n"
+
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
+
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(diff)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v4");
+ return diff;
+}
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/compare_win.cc b/libyuv/src/main/cpp/libyuv/source/compare_win.cc
new file mode 100644
index 0000000..5ca5917
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/compare_win.cc
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#if defined(_MSC_VER)
+#include // For __popcnt
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 diff = 0u;
+
+ int i;
+ for (i = 0; i < count - 3; i += 4) {
+ uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
+ src_a += 4;
+ src_b += 4;
+ diff += __popcnt(x);
+ }
+ return diff;
+}
+
+__declspec(naked) uint32
+ SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+
+ wloop:
+ movdqu xmm1, [eax]
+ lea eax, [eax + 16]
+ movdqu xmm2, [edx]
+ lea edx, [edx + 16]
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ sub ecx, 16
+ jg wloop
+
+ pshufd xmm1, xmm0, 0xee
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable : 4752)
+__declspec(naked) uint32
+ SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax
+
+ wloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + edx]
+ lea eax, [eax + 32]
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ sub ecx, 32
+ jg wloop
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
+uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+__declspec(naked) uint32
+ HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, xmmword ptr kHash16x33
+
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ movdqa xmm5, xmmword ptr kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld xmm3, xmm5
+ movdqa xmm5, xmmword ptr kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld xmm4, xmm5
+ movdqa xmm5, xmmword ptr kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld xmm2, xmm5
+ movdqa xmm5, xmmword ptr kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ sub ecx, 16
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked) uint32
+ HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ vmovd xmm0, [esp + 12] // seed
+
+ wloop:
+ vpmovzxbd xmm3, [eax] // src[0-3]
+ vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, [eax + 4] // src[4-7]
+ vpmulld xmm3, xmm3, xmmword ptr kHashMul0
+ vpmovzxbd xmm2, [eax + 8] // src[8-11]
+ vpmulld xmm4, xmm4, xmmword ptr kHashMul1
+ vpmovzxbd xmm1, [eax + 12] // src[12-15]
+ vpmulld xmm2, xmm2, xmmword ptr kHashMul2
+ lea eax, [eax + 16]
+ vpmulld xmm1, xmm1, xmmword ptr kHashMul3
+ vpaddd xmm3, xmm3, xmm4 // add 16 results
+ vpaddd xmm1, xmm1, xmm2
+ vpaddd xmm1, xmm1, xmm3
+ vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ vpaddd xmm1, xmm1,xmm2
+ vpshufd xmm2, xmm1, 0x01
+ vpaddd xmm1, xmm1, xmm2
+ vpaddd xmm0, xmm0, xmm1
+ sub ecx, 16
+ jg wloop
+
+ vmovd eax, xmm0 // return hash
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert.cc b/libyuv/src/main/cpp/libyuv/source/convert.cc
new file mode 100644
index 0000000..dfa83a5
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert.cc
@@ -0,0 +1,1718 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int src_uv_width,
+ int src_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ if (src_uv_width == 0 || src_uv_height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ }
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return 0;
+}
+
+// Copy I420 with optional flipping
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// is does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, src_uv_width, height);
+}
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, width, height);
+}
+
+// I400 is greyscale typically used in MJPG
+LIBYUV_API
+int I400ToI420(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+static void CopyPlane2(const uint8* src,
+ int src_stride_0,
+ int src_stride_1,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src, dst, width);
+ CopyRow(src + src_stride_0, dst + dst_stride, width);
+ src += src_stride_0 + src_stride_1;
+ dst += dst_stride * 2;
+ }
+ if (height & 1) {
+ CopyRow(src, dst, width);
+ }
+}
+
+// Support converting from FOURCC_M420
+// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is row planar. Normally this will be the width in pixels.
+// The UV plane is half width, but 2 values, so src_stride_m420 applies to
+// this as well as the two Y planes.
+static int X420ToI420(const uint8* src_y,
+ int src_stride_y0,
+ int src_stride_y1,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ }
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_y0 == width && src_stride_y1 == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+
+ if (dst_y) {
+ if (src_stride_y0 == src_stride_y1) {
+ CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+ } else {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
+ }
+
+ // Split UV plane - NV12 / NV21
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+
+ return 0;
+}
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
+ dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+}
+
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_vu,
+ int src_stride_vu,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
+ dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
+ dst_stride_u, width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u,
+ uint8* dst_v, int width) = YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u,
+ uint8* dst_v, int width) = UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+ UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUVRow = UYVYToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUVRow = UYVYToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u,
+ uint8* dst_v, int width) = BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+ BGRAToYRow_C;
+ if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ BGRAToYRow = BGRAToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ BGRAToUVRow = BGRAToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToUVRow = BGRAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u,
+ uint8* dst_v, int width) = ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+ ABGRToYRow_C;
+ if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ABGRToYRow = ABGRToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u,
+ uint8* dst_v, int width) = RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+ RGBAToYRow_C;
+ if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGBAToYRow = RGBAToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGBAToUVRow = RGBAToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToUVRow = RGBAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+ RGB24ToYRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
+ }
+ }
+ }
+#elif defined(HAS_RGB24TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+ RGB24ToYRow = RGB24ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_MSA;
+ RGB24ToUVRow = RGB24ToUVRow_MSA;
+ }
+ }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert RAW to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u,
+ uint8* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+ RAWToYRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
+ }
+ }
+ }
+#elif defined(HAS_RAWTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVRow = RAWToUVRow_Any_MSA;
+ RAWToYRow = RAWToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_MSA;
+ RAWToUVRow = RAWToUVRow_MSA;
+ }
+ }
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) =
+ RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+#elif defined(HAS_RGB565TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+ RGB565ToYRow = RGB565ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_MSA;
+ RGB565ToUVRow = RGB565ToUVRow_MSA;
+ }
+ }
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) =
+ ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
+ ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+#elif defined(HAS_ARGB1555TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+ }
+ }
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) =
+ ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
+ ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+ }
+#endif
+#endif
+
+ {
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+static void SplitPixels(const uint8* src_u,
+ int src_pixel_stride_uv,
+ uint8* dst_u,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst_u = *src_u;
+ ++dst_u;
+ src_u += src_pixel_stride_uv;
+ }
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Copy UV planes as is - I420
+ if (src_pixel_stride_uv == 1) {
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+ // Split UV planes - NV21
+ } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ return 0;
+ // Split UV planes - NV12
+ } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
+ src_stride_u == src_stride_v) {
+ SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ }
+
+ for (y = 0; y < halfheight; ++y) {
+ SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+ SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert_argb.cc b/libyuv/src/main/cpp/libyuv/source/convert_argb.cc
new file mode 100644
index 0000000..5007bdb
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert_argb.cc
@@ -0,0 +1,1654 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional flipping
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+ height);
+ return 0;
+}
+
+// Convert I422 to ARGB with matrix
+static int I420ToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to ARGB with matrix
+static int I422ToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix
+static int I444ToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert I420 with Alpha to preattenuated ARGB.
+static int I420AlphaToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBAttenuateRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I420 with Alpha to ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) =
+ I400ToARGBRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert J400 to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ J400ToARGBRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_J400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J400ToARGBRow = J400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_MSA;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ J400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u,
+ 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u,
+ 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u,
+ 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
+}
+
+// Convert ARGB to ABGR to (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskRGBAToARGB), width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
+ }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
+ RGB565ToARGBRow_C;
+ if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ // Coalesce rows.
+ if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ // Coalesce rows.
+ if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ NV12ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ NV21ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ NV12ToARGBRow_C;
+ if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+ &kYuvI601Constants, width);
+ NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+ dst_argb + dst_stride_argb, &kYuvI601Constants, width);
+ dst_argb += dst_stride_argb * 2;
+ src_m420 += src_stride_m420 * 3;
+ }
+ if (height & 1) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+ &kYuvI601Constants, width);
+ }
+ return 0;
+}
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
+ YUY2ToARGBRow_C;
+ if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
+ UYVYToARGBRow_C;
+ if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
+ }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_MSA;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert_from.cc b/libyuv/src/main/cpp/libyuv/source/convert_from.cc
new file mode 100644
index 0000000..d623731
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert_from.cc
@@ -0,0 +1,1254 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int dst_uv_width,
+ int dst_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+ dst_uv_height <= 0) {
+ return -1;
+ }
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ }
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
+LIBYUV_API
+int I420ToI422(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I420ToI444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// Copy to I400. Source can be I420,422,444,400,NV12,NV21
+LIBYUV_API
+int I400Copy(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+// TODO(fbarchard): test negative height for invert.
+LIBYUV_API
+int I420ToNV12(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ int halfwidth = (width + 1) / 2;
+ int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+ halfwidth, halfheight);
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I422 to RGBA with matrix
+static int I420ToRGBAMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+ I422ToRGBARow = I422ToRGBARow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix
+static int I420ToRGB24Matrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(uint32*)(dither4x4 + ((y & 3) << 2)),
+ width); // NOLINT
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y,
+ int y_stride,
+ const uint8* u,
+ int u_stride,
+ const uint8* v,
+ int v_stride,
+ uint8* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int r = 0;
+ if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
+ return -1;
+ }
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ case FOURCC_NV12: {
+ uint8* dst_uv = dst_sample + width * height;
+ r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_uv,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ }
+ case FOURCC_NV21: {
+ uint8* dst_vu = dst_sample + width * height;
+ r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_vu,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ }
+ // TODO(fbarchard): Add M420.
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * halfheight;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * halfheight;
+ }
+ r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * height;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * height;
+ }
+ r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + dst_sample_stride * height;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + dst_sample_stride * height;
+ }
+ r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+ dst_sample_stride, width, height);
+ break;
+ }
+ // Formats not supported - MJPG, biplanar, some rgb formats.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert_from_argb.cc b/libyuv/src/main/cpp/libyuv/source/convert_from_argb.cc
new file mode 100644
index 0000000..88f3827
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,1568 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) = ARGBToUV444Row_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_y == width &&
+ dst_stride_u == width && dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_y == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+
+ if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
+
+ {
+ // Allocate a rows of yuv.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+ src_argb += src_stride_argb;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+
+ if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+
+ {
+ // Allocate a rows of yuv.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+ src_argb += src_stride_argb;
+ dst_uyvy += dst_stride_uyvy;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u,
+ 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+ (const uint8*)(&kShuffleMaskARGBToRGBA), width, height);
+}
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToRGB24Row_C;
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToRAWRow_C;
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_raw = 0;
+ }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
+ }
+ return 0;
+}
+
+// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+ *(uint32*)(dither4x4 + ((y & 3) << 2)),
+ width); /* NOLINT */
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To RGB565.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToRGB565Row_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb565 = 0;
+ }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToARGB1555Row_C;
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb1555 = 0;
+ }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToARGB4444Row_C;
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb4444 = 0;
+ }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
+}
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+ src_argb += src_stride_argb * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to J422. (JPeg full range I422).
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert_jpeg.cc b/libyuv/src/main/cpp/libyuv/source/convert_jpeg.cc
new file mode 100644
index 0000000..216a9f2
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+ dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret) {
+ *width = mjpeg_decoder.GetWidth();
+ *height = mjpeg_decoder.GetHeight();
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret ? 0 : -1; // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y,
+ int y_stride,
+ uint8* u,
+ int u_stride,
+ uint8* v,
+ int v_stride,
+ int w,
+ int h,
+ int dw,
+ int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret &&
+ (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice.
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->argb, dest->argb_stride, dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->argb, dest->argb_stride, dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->argb, dest->argb_stride, dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb,
+ int argb_stride,
+ int w,
+ int h,
+ int dw,
+ int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret &&
+ (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = {argb, argb_stride, dw, dh};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice.
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert_to_argb.cc b/libyuv/src/main/cpp/libyuv/source/convert_to_argb.cc
new file mode 100644
index 0000000..63a5104
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,270 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* crop_argb,
+ int argb_stride,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+ // and then rotate the ARGB to the final destination buffer.
+ // For in-place conversion, if destination crop_argb is same as source sample,
+ // also enable temporary buffer.
+ LIBYUV_BOOL need_buf =
+ (rotation && format != FOURCC_ARGB) || crop_argb == sample;
+ uint8* dest_argb = crop_argb;
+ int dest_argb_stride = argb_stride;
+ uint8* rotate_buffer = NULL;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+ if (crop_argb == NULL || sample == NULL || src_width <= 0 ||
+ crop_width <= 0 || src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
+
+ if (need_buf) {
+ int argb_size = crop_width * 4 * abs_crop_height;
+ rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ crop_argb = rotate_buffer;
+ argb_stride = crop_width * 4;
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ if (!need_buf && !rotation) {
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ }
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
+ argb_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
+ argb_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_J420: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width,
+ abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ } else if (rotation) {
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height, rotation);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/convert_to_i420.cc b/libyuv/src/main/cpp/libyuv/source/convert_to_i420.cc
new file mode 100644
index 0000000..a50689d
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,263 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y,
+ int y_stride,
+ uint8* u,
+ int u_stride,
+ uint8* v,
+ int v_stride,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ // TODO(nisse): Why allow crop_height < 0?
+ const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+ LIBYUV_BOOL need_buf =
+ (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+ format != FOURCC_NV21 && format != FOURCC_YV12) ||
+ y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* rotate_buffer = NULL;
+ const int inv_crop_height =
+ (src_height < 0) ? -abs_crop_height : abs_crop_height;
+
+ if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination y is same as source sample,
+ // also enable temporary buffer.
+ if (need_buf) {
+ int y_size = crop_width * abs_crop_height;
+ int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+ rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ y = rotate_buffer;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = crop_width;
+ u_stride = v_stride = ((crop_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
+ y_stride, v, v_stride, u, u_stride, crop_width,
+ inv_crop_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width,
+ abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/cpu_id.cc b/libyuv/src/main/cpp/libyuv/source/cpu_id.cc
new file mode 100644
index 0000000..ce16554
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/cpu_id.cc
@@ -0,0 +1,348 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER)
+#include // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+#include // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include
+#include
+
+#include "libyuv/basic_types.h" // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+ !defined(__clang__)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
+#if defined(_MSC_VER)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ __cpuidex(cpu_info, info_eax, info_ecx);
+#elif defined(_M_IX86)
+ __asm {
+ mov eax, info_eax
+ mov ecx, info_ecx
+ mov edi, cpu_info
+ cpuid
+ mov [edi], eax
+ mov [edi + 4], ebx
+ mov [edi + 8], ecx
+ mov [edi + 12], edx
+ }
+#else // Visual C but not x86
+ if (info_ecx == 0) {
+ __cpuid(cpu_info, info_eax);
+ } else {
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
+ }
+#endif
+// GCC version uses inline x86 assembly.
+#else // defined(_MSC_VER)
+ int info_ebx, info_edx;
+ asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D"(info_ebx),
+#else
+ "cpuid \n"
+ : "=b"(info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+ "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
+ cpu_info[0] = info_eax;
+ cpu_info[1] = info_ebx;
+ cpu_info[2] = info_ecx;
+ cpu_info[3] = info_edx;
+#endif // defined(_MSC_VER)
+}
+#else // (defined(_M_IX86) || defined(_M_X64) ...
+LIBYUV_API
+void CpuId(int eax, int ecx, int* cpu_info) {
+ (void)eax;
+ (void)ecx;
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// For VS2010 and earlier emit can be used:
+// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+// __asm {
+// xor ecx, ecx // xcr 0
+// xgetbv
+// mov xcr0, eax
+// }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+int GetXCR0() {
+ int xcr0 = 0;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ xcr0 = _xgetbv(0); // VS2010 SP1 required.
+#elif defined(__i386__) || defined(__x86_64__)
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
+#endif // defined(__i386__) || defined(__x86_64__)
+ return xcr0;
+}
+#else
+// xgetbv unavailable to query for OSSave support. Return 0.
+#define GetXCR0() 0
+#endif // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
+
+// based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
+ char cpuinfo_line[512];
+ FILE* f = fopen(cpuinfo_name, "r");
+ if (!f) {
+ // Assume Neon if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasNEON;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+ char* p = strstr(cpuinfo_line, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ // aarch64 uses asimd for Neon.
+ p = strstr(cpuinfo_line, " asimd");
+ if (p) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ }
+ }
+ fclose(f);
+ return 0;
+}
+
+// TODO(fbarchard): Consider read_msa_ir().
+// TODO(fbarchard): Add unittest.
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
+ const char ase[]) {
+ char cpuinfo_line[512];
+ FILE* f = fopen(cpuinfo_name, "r");
+ if (!f) {
+ // ase enabled if /proc/cpuinfo is unavailable.
+ if (strcmp(ase, " msa") == 0) {
+ return kCpuHasMSA;
+ }
+ return kCpuHasDSPR2;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+ char* p = strstr(cpuinfo_line, ase);
+ if (p) {
+ fclose(f);
+ if (strcmp(ase, " msa") == 0) {
+ return kCpuHasMSA;
+ }
+ return kCpuHasDSPR2;
+ }
+ }
+ }
+ fclose(f);
+ return 0;
+}
+
+// Test environment variable for disabling CPU features. Any non-zero value
+// to disable. Zero ignored to make it easy to set the variable on/off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
+ const char* var = getenv(name);
+ if (var) {
+ if (var[0] != '0') {
+ return LIBYUV_TRUE;
+ }
+ }
+ return LIBYUV_FALSE;
+}
+#else // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+ return LIBYUV_FALSE;
+}
+#endif
+
+static SAFEBUFFERS int GetCpuFlags(void) {
+ int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+ int cpu_info0[4] = {0, 0, 0, 0};
+ int cpu_info1[4] = {0, 0, 0, 0};
+ int cpu_info7[4] = {0, 0, 0, 0};
+ CpuId(0, 0, cpu_info0);
+ CpuId(1, 0, cpu_info1);
+ if (cpu_info0[0] >= 7) {
+ CpuId(7, 0, cpu_info7);
+ }
+ cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
+
+ // AVX requires OS saves YMM registers.
+ if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
+ ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
+ cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+ ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
+
+ // Detect AVX512bw
+ if ((GetXCR0() & 0xe0) == 0xe0) {
+ cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
+ }
+ }
+
+ // Environment variable overrides for testing.
+ if (TestEnv("LIBYUV_DISABLE_X86")) {
+ cpu_info &= ~kCpuHasX86;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+ cpu_info &= ~kCpuHasSSE2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+ cpu_info &= ~kCpuHasSSSE3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+ cpu_info &= ~kCpuHasSSE41;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+ cpu_info &= ~kCpuHasSSE42;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX")) {
+ cpu_info &= ~kCpuHasAVX;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+ cpu_info &= ~kCpuHasAVX2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+ cpu_info &= ~kCpuHasERMS;
+ }
+ if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+ cpu_info &= ~kCpuHasFMA3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX3")) {
+ cpu_info &= ~kCpuHasAVX3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_F16C")) {
+ cpu_info &= ~kCpuHasF16C;
+ }
+
+#endif
+#if defined(__mips__) && defined(__linux__)
+#if defined(__mips_dspr2)
+ cpu_info |= kCpuHasDSPR2;
+#endif
+#if defined(__mips_msa)
+ cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
+#endif
+ cpu_info |= kCpuHasMIPS;
+ if (getenv("LIBYUV_DISABLE_DSPR2")) {
+ cpu_info &= ~kCpuHasDSPR2;
+ }
+ if (getenv("LIBYUV_DISABLE_MSA")) {
+ cpu_info &= ~kCpuHasMSA;
+ }
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+ cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+ cpu_info = kCpuHasNEON;
+#else
+ // Linux arm parse text file for neon detect.
+ cpu_info = ArmCpuCaps("/proc/cpuinfo");
+#endif
+ cpu_info |= kCpuHasARM;
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info &= ~kCpuHasNEON;
+ }
+#endif // __arm__
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info = 0;
+ }
+ cpu_info |= kCpuInitialized;
+ return cpu_info;
+}
+
+// Note that use of this function is not thread safe.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags) {
+ int cpu_info = GetCpuFlags() & enable_flags;
+#ifdef __ATOMIC_RELAXED
+ __atomic_store_n(&cpu_info_, cpu_info, __ATOMIC_RELAXED);
+#else
+ cpu_info_ = cpu_info;
+#endif
+ return cpu_info;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+ return MaskCpuFlags(-1);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/mjpeg_decoder.cc b/libyuv/src/main/cpp/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 0000000..67568b4
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,577 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include
+
+#ifdef __cplusplus
+#include
+#endif
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable : 4324)
+#endif
+
+#endif
+struct FILE; // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h" // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+void OutputHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(LIBYUV_FALSE),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+ error_mgr_->base.output_message = &OutputHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+ jpeg_destroy_decompress(decompress_struct_);
+ delete decompress_struct_;
+ delete source_mgr_;
+#ifdef HAVE_SETJMP
+ delete error_mgr_;
+#endif
+ DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+ if (!ValidateJpeg(src, src_len)) {
+ return LIBYUV_FALSE;
+ }
+
+ buf_.data = src;
+ buf_.len = static_cast(src_len);
+ buf_vec_.pos = 0;
+ decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_read_header, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+ // ERROR: Bad MJPEG header
+ return LIBYUV_FALSE;
+ }
+ AllocOutputBuffers(GetNumComponents());
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+ if (scanlines_sizes_[i] != scanlines_size) {
+ if (scanlines_[i]) {
+ delete scanlines_[i];
+ }
+ scanlines_[i] = new uint8*[scanlines_size];
+ scanlines_sizes_[i] = scanlines_size;
+ }
+
+ // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+ // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
+ // the preceding scanlines, the padding is not needed/wanted because the
+ // following addresses will already be valid (they are the initial bytes of
+ // the next scanline) and will be overwritten when jpeglib writes out that
+ // next scanline.
+ int databuf_stride = GetComponentStride(i);
+ int databuf_size = scanlines_size * databuf_stride;
+ if (databuf_strides_[i] != databuf_stride) {
+ if (databuf_[i]) {
+ delete databuf_[i];
+ }
+ databuf_[i] = new uint8[databuf_size];
+ databuf_strides_[i] = databuf_stride;
+ }
+
+ if (GetComponentStride(i) != GetComponentWidth(i)) {
+ has_scanline_padding_ = LIBYUV_TRUE;
+ }
+ }
+ return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+ return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+ return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+ return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+ return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+ return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+ return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+ return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+ return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+ return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+ return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+ return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+ int hs = GetHorizSubSampFactor(component);
+ return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+ return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {
+ return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes,
+ int dst_width,
+ int dst_height) {
+ if (dst_width != GetWidth() || dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // Compute amount of lines to skip to implement vertical crop.
+ // TODO(fbarchard): Ensure skip is a multiple of maximum component
+ // subsample. ie 2
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy =
+ GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i),
+ scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
+
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+ void* opaque,
+ int dst_width,
+ int dst_height) {
+ if (dst_width != GetWidth() || dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = reinterpret_cast(cinfo->client_data);
+ if (buf_vec->pos >= buf_vec->len) {
+ assert(0 && "No more data");
+ // ERROR: No more data
+ return FALSE;
+ }
+ cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+ cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+ ++buf_vec->pos;
+ return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
+ cinfo->src->next_input_byte += num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {
+ (void)cinfo; // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+// ERROR: Error in jpeglib: buf
+#endif
+
+ SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+
+// Suppress fprintf warnings.
+void OutputHandler(j_common_ptr cinfo) {
+ (void)cinfo;
+}
+
+#endif // HAVE_SETJMP
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+ if (num_outbufs != num_outbufs_) {
+ // We could perhaps optimize this case to resize the output buffers without
+ // necessarily having to delete and recreate each one, but it's not worth
+ // it.
+ DestroyOutputBuffers();
+
+ scanlines_ = new uint8**[num_outbufs];
+ scanlines_sizes_ = new int[num_outbufs];
+ databuf_ = new uint8*[num_outbufs];
+ databuf_strides_ = new int[num_outbufs];
+
+ for (int i = 0; i < num_outbufs; ++i) {
+ scanlines_[i] = NULL;
+ scanlines_sizes_[i] = 0;
+ databuf_[i] = NULL;
+ databuf_strides_[i] = 0;
+ }
+
+ num_outbufs_ = num_outbufs;
+ }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ delete[] scanlines_[i];
+ delete[] databuf_[i];
+ }
+ delete[] scanlines_;
+ delete[] databuf_;
+ delete[] scanlines_sizes_;
+ delete[] databuf_strides_;
+ scanlines_ = NULL;
+ databuf_ = NULL;
+ scanlines_sizes_ = NULL;
+ databuf_strides_ = NULL;
+ num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and do_block_smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+ decompress_struct_->raw_data_out = TRUE;
+ decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
+ decompress_struct_->dither_mode = JDITHER_NONE;
+ // Not applicable to 'raw':
+ decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+ // Only for buffered mode:
+ decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+ // Blocky but fast:
+ decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+ if (!jpeg_start_decompress(decompress_struct_)) {
+ // ERROR: Couldn't start JPEG decompressor";
+ return LIBYUV_FALSE;
+ }
+ return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ uint8* data_i = data[i];
+ for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+ scanlines_[i][j] = data_i;
+ data_i += GetComponentStride(i);
+ }
+ }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+ return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+ jpeg_read_raw_data(decompress_struct_, scanlines_,
+ GetImageScanlinesPerImcuRow());
+}
+
+// The helper function which recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+ int* subsample_x,
+ int* subsample_y,
+ int number_of_components) {
+ if (number_of_components == 3) { // Color images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+ subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
+ return kJpegYuv420;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
+ return kJpegYuv422;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
+ return kJpegYuv444;
+ }
+ } else if (number_of_components == 1) { // Grey-scale images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+ return kJpegYuv400;
+ }
+ }
+ return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
diff --git a/libyuv/src/main/cpp/libyuv/source/mjpeg_validate.cc b/libyuv/src/main/cpp/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 0000000..1a17dd7
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to scan for EOI marker (0xff 0xd9).
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+ if (sample_size >= 2) {
+ const uint8* end = sample + sample_size - 1;
+ const uint8* it = sample;
+ while (it < end) {
+ // TODO(fbarchard): scan for 0xd9 instead.
+ it = static_cast(memchr(it, 0xff, end - it));
+ if (it == NULL) {
+ break;
+ }
+ if (it[1] == 0xd9) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ ++it; // Skip over current 0xff.
+ }
+ }
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+ // Maximum size that ValidateJpeg will consider valid.
+ const size_t kMaxJpegSize = 0x7fffffffull;
+ const size_t kBackSearchSize = 1024;
+ if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
+ // ERROR: Invalid jpeg size: sample_size
+ return LIBYUV_FALSE;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
+ // ERROR: Invalid jpeg initial start code
+ return LIBYUV_FALSE;
+ }
+
+ // Look for the End Of Image (EOI) marker near the end of the buffer.
+ if (sample_size > kBackSearchSize) {
+ if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ // Reduce search size for forward search.
+ sample_size = sample_size - kBackSearchSize + 1;
+ }
+ // Step over SOI marker and scan for EOI.
+ return ScanEOI(sample + 2, sample_size - 2);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/planar_functions.cc b/libyuv/src/main/cpp/libyuv/source/planar_functions.cc
new file mode 100644
index 0000000..b8a53e8
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/planar_functions.cc
@@ -0,0 +1,3333 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h" // for ScaleRowDown2
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Nothing to do.
+ if (src_y == dst_y && src_stride_y == dst_stride_y) {
+ return;
+ }
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height; ++y) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_COPYROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_16_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_16_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_16_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_16_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height; ++y) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Copy I422.
+LIBYUV_API
+int I422Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
+// Copy I444.
+LIBYUV_API
+int I444Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+}
+
+// Copy I400.
+LIBYUV_API
+int I400ToI400(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ (void)src_u;
+ (void)src_stride_u;
+ (void)src_v;
+ (void)src_stride_v;
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Support function for NV12 etc UV channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitUVPlane(const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_u, 4) &&
+ IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) &&
+ IS_ALIGNED(dst_stride_v, 4)) {
+ SplitUVRow = SplitUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Copy a row of UV.
+ SplitUVRow(src_uv, dst_u, dst_v, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+}
+
+LIBYUV_API
+void MergeUVPlane(const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ // Coalesce rows.
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_u == width && src_stride_v == width &&
+ dst_stride_uv == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow(src_u, src_v, dst_uv, width);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+}
+
+// Mirror a plane of data.
+void MirrorPlane(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) &&
+ IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) &&
+ IS_ALIGNED(dst_stride_y, 4)) {
+ MirrorRow = MirrorRow_DSPR2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int width) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ YUY2ToYRow_C;
+ if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+ width * height <= 32768) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int width) = UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+ UYVYToYRow_C;
+ if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+ width * height <= 32768) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUV422Row = UYVYToUV422Row_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUV422Row = UYVYToUV422Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert YUY2 to Y.
+LIBYUV_API
+int YUY2ToY(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ YUY2ToYRow_C;
+ if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = 0;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Mirror I400 with optional flipping
+LIBYUV_API
+int I400Mirror(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ ARGBMirrorRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Get a blender that optimized for the CPU and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ return ARGBBlendRow;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBBlendRow = ARGBBlendRow_NEON;
+ }
+#endif
+ return ARGBBlendRow;
+}
+
+// Alpha Blend 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = GetARGBBlend();
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+
+ for (y = 0; y < height; ++y) {
+ ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Alpha Blend plane and store to destination.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) =
+ BlendPlaneRow_C;
+ if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+
+ // Coalesce rows for Y plane.
+ if (src_stride_y0 == width && src_stride_y1 == width &&
+ alpha_stride == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
+ }
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ BlendPlaneRow = BlendPlaneRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ BlendPlaneRow = BlendPlaneRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
+ src_y0 += src_stride_y0;
+ src_y1 += src_stride_y1;
+ alpha += alpha_stride;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+#define MAXTWIDTH 2048
+// Alpha Blend YUV images and store to destination.
+LIBYUV_API
+int I420Blend(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_u0,
+ int src_stride_u0,
+ const uint8* src_v0,
+ int src_stride_v0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* src_u1,
+ int src_stride_u1,
+ const uint8* src_v1,
+ int src_stride_v1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ // Half width/height for UV.
+ int halfwidth = (width + 1) >> 1;
+ void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) =
+ BlendPlaneRow_C;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+ if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+ !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+
+ // Blend Y plane.
+ BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+ dst_y, dst_stride_y, width, height);
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ BlendPlaneRow = BlendPlaneRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ BlendPlaneRow = BlendPlaneRow_AVX2;
+ }
+ }
+#endif
+ if (!IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
+ }
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+ }
+ }
+ }
+#endif
+
+ // Row buffer for intermediate alpha pixels.
+ align_buffer_64(halfalpha, halfwidth);
+ for (y = 0; y < height; y += 2) {
+ // last row of odd height image use 1 row of alpha instead of 2.
+ if (y == (height - 1)) {
+ alpha_stride = 0;
+ }
+ // Subsample 2 rows of UV to half width and half height.
+ ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
+ alpha += alpha_stride * 2;
+ BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
+ BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
+ src_u0 += src_stride_u0;
+ src_u1 += src_stride_u1;
+ dst_u += dst_stride_u;
+ src_v0 += src_stride_v0;
+ src_v1 += src_stride_v1;
+ dst_v += dst_stride_v;
+ }
+ free_aligned_buffer_64(halfalpha);
+ return 0;
+}
+
+// Multiply 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBMultiplyRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+ }
+ }
+#endif
+
+ // Multiply plane
+ for (y = 0; y < height; ++y) {
+ ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBAddRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAddRow = ARGBAddRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAddRow = ARGBAddRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAddRow = ARGBAddRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_MSA;
+ }
+ }
+#endif
+
+ // Add plane
+ for (y = 0; y < height; ++y) {
+ ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBSubtractRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_MSA;
+ }
+ }
+#endif
+
+ // Subtract plane
+ for (y = 0; y < height; ++y) {
+ ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+// Convert I422 to RGBA with matrix
+static int I422ToRGBAMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+ I422ToRGBARow = I422ToRGBARow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert RAW to RGB24.
+LIBYUV_API
+int RAWToRGB24(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+ RAWToRGB24Row_C;
+ if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGB24Row = RAWToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGB24Row = RAWToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGB24Row = RAWToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGB24Row(src_raw, dst_rgb24, width);
+ src_raw += src_stride_raw;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+LIBYUV_API
+void SetPlane(uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ uint32 value) {
+ int y;
+ void (*SetRow)(uint8 * dst, uint8 value, int width) = SetRow_C;
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ dst_stride_y = 0;
+ }
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SetRow = SetRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRow = SetRow_Any_X86;
+ if (IS_ALIGNED(width, 4)) {
+ SetRow = SetRow_X86;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ SetRow = SetRow_ERMS;
+ }
+#endif
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ SetRow(dst_y, value, width);
+ dst_y += dst_stride_y;
+ }
+}
+
+// Draw a rectangle into I420
+LIBYUV_API
+int I420Rect(uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int x,
+ int y,
+ int width,
+ int height,
+ int value_y,
+ int value_u,
+ int value_v) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ uint8* start_y = dst_y + y * dst_stride_y + x;
+ uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+ if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+ y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
+ value_v < 0 || value_v > 255) {
+ return -1;
+ }
+
+ SetPlane(start_y, dst_stride_y, width, height, value_y);
+ SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+ SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+ return 0;
+}
+
+// Draw a rectangle into ARGB
+LIBYUV_API
+int ARGBRect(uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height,
+ uint32 value) {
+ int y;
+ void (*ARGBSetRow)(uint8 * dst_argb, uint32 value, int width) = ARGBSetRow_C;
+ if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+
+#if defined(HAS_ARGBSETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSetRow = ARGBSetRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBSetRow = ARGBSetRow_X86;
+ }
+#endif
+#if defined(HAS_ARGBSETROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBSetRow = ARGBSetRow_Any_MSA;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_MSA;
+ }
+ }
+#endif
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ ARGBSetRow(dst_argb, value, width);
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert unattentuated ARGB to preattenuated ARGB.
+// An unattenutated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// An preattenutated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBAttenuateRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBAttenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert preattentuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+ }
+ }
+#endif
+ // TODO(fbarchard): Neon version.
+
+ for (y = 0; y < height; ++y) {
+ ARGBUnattenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB to Grayed ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBGrayRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_MSA;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBGrayRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBGrayRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_MSA;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBGrayRow(dst, dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBSepiaRow)(uint8 * dst_argb, int width) = ARGBSepiaRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_NEON;
+ }
+#endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_MSA;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBSepiaRow(dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const int8* matrix_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) =
+ ARGBColorMatrixRow_C;
+ if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb,
+ int dst_stride_argb,
+ const int8* matrix_rgb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
+ SIMD_ALIGNED(int8 matrix_argb[16]);
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+ dst_y < 0) {
+ return -1;
+ }
+
+ // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+ matrix_argb[0] = matrix_rgb[0] / 2;
+ matrix_argb[1] = matrix_rgb[1] / 2;
+ matrix_argb[2] = matrix_rgb[2] / 2;
+ matrix_argb[3] = matrix_rgb[3] / 2;
+ matrix_argb[4] = matrix_rgb[4] / 2;
+ matrix_argb[5] = matrix_rgb[5] / 2;
+ matrix_argb[6] = matrix_rgb[6] / 2;
+ matrix_argb[7] = matrix_rgb[7] / 2;
+ matrix_argb[8] = matrix_rgb[8] / 2;
+ matrix_argb[9] = matrix_rgb[9] / 2;
+ matrix_argb[10] = matrix_rgb[10] / 2;
+ matrix_argb[11] = matrix_rgb[11] / 2;
+ matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+ matrix_argb[15] = 64; // 1.0
+
+ return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, dst,
+ dst_stride_argb, &matrix_argb[0], width, height);
+}
+
+// Apply a color table each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
+ int width) = ARGBColorTableRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+ dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBColorTableRow = ARGBColorTableRow_X86;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
+ int y;
+ void (*RGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
+ int width) = RGBColorTableRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+ dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ RGBColorTableRow = RGBColorTableRow_X86;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ RGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low levels implement efficiently with 3 parameters, and could be
+// used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaces with a multiply by reciprocal fixed point multiply.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb,
+ int dst_stride_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBQuantizeRow)(uint8 * dst_argb, int scale, int interval_size,
+ int interval_offset, int width) = ARGBQuantizeRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+ interval_size < 1 || interval_size > 255) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb,
+ int src_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height) {
+ int y;
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) =
+ ComputeCumulativeSumRow_C;
+ int32* previous_cumsum = dst_cumsum;
+ if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+ return -1;
+ }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ }
+#endif
+ memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
+ for (y = 0; y < height; ++y) {
+ ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+ previous_cumsum = dst_cumsum;
+ dst_cumsum += dst_stride32_cumsum;
+ src_argb += src_stride_argb;
+ }
+ return 0;
+}
+
+// Blur ARGB image.
+// Caller should allocate CumulativeSum table of width * height * 16 bytes
+// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
+// as the buffer is treated as circular.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height,
+ int radius) {
+ int y;
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) =
+ ComputeCumulativeSumRow_C;
+ void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) = CumulativeSumToAverageRow_C;
+ int32* cumsum_bot_row;
+ int32* max_cumsum_bot_row;
+ int32* cumsum_top_row;
+
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (radius > height) {
+ radius = height;
+ }
+ if (radius > (width / 2 - 1)) {
+ radius = width / 2 - 1;
+ }
+ if (radius <= 0) {
+ return -1;
+ }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+ }
+#endif
+ // Compute enough CumulativeSum for first row to be blurred. After this
+ // one row of CumulativeSum is updated at a time.
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+ dst_stride32_cumsum, width, radius);
+
+ src_argb = src_argb + radius * src_stride_argb;
+ cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+ max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+ cumsum_top_row = &dst_cumsum[0];
+
+ for (y = 0; y < height; ++y) {
+ int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+ int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+ int area = radius * (bot_y - top_y);
+ int boxwidth = radius * 4;
+ int x;
+ int n;
+
+ // Increment cumsum_top_row pointer with circular buffer wrap around.
+ if (top_y) {
+ cumsum_top_row += dst_stride32_cumsum;
+ if (cumsum_top_row >= max_cumsum_bot_row) {
+ cumsum_top_row = dst_cumsum;
+ }
+ }
+ // Increment cumsum_bot_row pointer with circular buffer wrap around and
+ // then fill in a row of CumulativeSum.
+ if ((y + radius) < height) {
+ const int32* prev_cumsum_bot_row = cumsum_bot_row;
+ cumsum_bot_row += dst_stride32_cumsum;
+ if (cumsum_bot_row >= max_cumsum_bot_row) {
+ cumsum_bot_row = dst_cumsum;
+ }
+ ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+ width);
+ src_argb += src_stride_argb;
+ }
+
+ // Left clipped.
+ for (x = 0; x < radius + 1; ++x) {
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+ &dst_argb[x * 4], 1);
+ area += (bot_y - top_y);
+ boxwidth += 4;
+ }
+
+ // Middle unclipped.
+ n = (width - 1) - radius - x + 1;
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+ &dst_argb[x * 4], n);
+
+ // Right clipped.
+ for (x += n; x <= width - 1; ++x) {
+ area -= (bot_y - top_y);
+ boxwidth -= 4;
+ CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+ area, &dst_argb[x * 4], 1);
+ }
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Multiply ARGB image by a specified ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ uint32 value) {
+ int y;
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) = ARGBShadeRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_NEON;
+ }
+#endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_MSA;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBShadeRow(src_argb, dst_argb, width, value);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Interpolate 2 planes by specified amount (0 to 255).
+LIBYUV_API
+int InterpolatePlane(const uint8* src0,
+ int src_stride0,
+ const uint8* src1,
+ int src_stride1,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation) {
+ int y;
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst = dst + (height - 1) * dst_stride;
+ dst_stride = -dst_stride;
+ }
+ // Coalesce rows.
+ if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
+ width *= height;
+ height = 1;
+ src_stride0 = src_stride1 = dst_stride = 0;
+ }
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src0, 4) &&
+ IS_ALIGNED(src_stride0, 4) && IS_ALIGNED(src1, 4) &&
+ IS_ALIGNED(src_stride1, 4) && IS_ALIGNED(dst, 4) &&
+ IS_ALIGNED(dst_stride, 4) && IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ InterpolateRow(dst, src0, src1 - src0, width, interpolation);
+ src0 += src_stride0;
+ src1 += src_stride1;
+ dst += dst_stride;
+ }
+ return 0;
+}
+
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int interpolation) {
+ return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+ src_stride_argb1, dst_argb, dst_stride_argb,
+ width * 4, height, interpolation);
+}
+
+// Interpolate 2 YUV images by specified amount (0 to 255).
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y,
+ int src0_stride_y,
+ const uint8* src0_u,
+ int src0_stride_u,
+ const uint8* src0_v,
+ int src0_stride_v,
+ const uint8* src1_y,
+ int src1_stride_y,
+ const uint8* src1_u,
+ int src1_stride_u,
+ const uint8* src1_v,
+ int src1_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int interpolation) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+ !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+ dst_stride_y, width, height, interpolation);
+ InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+ dst_stride_u, halfwidth, halfheight, interpolation);
+ InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+ dst_stride_v, halfwidth, halfheight, interpolation);
+ return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* shuffler,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+ const uint8* shuffler, int width) = ARGBShuffleRow_C;
+ if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+ // Coalesce rows.
+ if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_bgra = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+ src_bgra += src_stride_bgra;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+static int ARGBSobelize(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ void (*SobelRow)(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst,
+ int width)) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+ ARGBToYJRow_C;
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely,
+ int width) = SobelYRow_C;
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ {
+ // 3 rows with edges before/after.
+ const int kRowSize = (width + kEdge + 31) & ~31;
+ align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ uint8* row_sobelx = rows;
+ uint8* row_sobely = rows + kRowSize;
+ uint8* row_y = rows + kRowSize * 2;
+
+ // Convert first row.
+ uint8* row_y0 = row_y + kEdge;
+ uint8* row_y1 = row_y0 + kRowSize;
+ uint8* row_y2 = row_y1 + kRowSize;
+ ARGBToYJRow(src_argb, row_y0, width);
+ row_y0[-1] = row_y0[0];
+ memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
+ ARGBToYJRow(src_argb, row_y1, width);
+ row_y1[-1] = row_y1[0];
+ memset(row_y1 + width, row_y1[width - 1], 16);
+ memset(row_y2 + width, 0, 16);
+
+ for (y = 0; y < height; ++y) {
+ // Convert next row of ARGB to G.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToYJRow(src_argb, row_y2, width);
+ row_y2[-1] = row_y2[0];
+ row_y2[width] = row_y2[width - 1];
+
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+ // Cycle thru circular queue of 3 row_y buffers.
+ {
+ uint8* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+ }
+
+ dst_argb += dst_stride_argb;
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelRow = SobelRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelRow = SobelRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SOBELROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelRow = SobelRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_MSA;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelRow);
+}
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SobelToPlaneRow = SobelToPlaneRow_MSA;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+ height, SobelToPlaneRow);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXYRow = SobelXYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXYRow = SobelXYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SOBELXYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelXYRow = SobelXYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_MSA;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelXYRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const float* poly,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBPolynomialRow)(const uint8* src_argb, uint8* dst_argb,
+ const float* poly, int width) = ARGBPolynomialRow_C;
+ if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+ IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ float scale,
+ int width,
+ int height) {
+ int y;
+ void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
+ HalfFloatRow_C;
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ src_stride_y >>= 1;
+ dst_stride_y >>= 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_HALFFLOATROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ HalfFloatRow = HalfFloatRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ HalfFloatRow = HalfFloatRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HalfFloatRow = HalfFloatRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ HalfFloatRow = HalfFloatRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+ if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+ HalfFloatRow =
+ (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+ if (IS_ALIGNED(width, 16)) {
+ HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+ }
+ }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ HalfFloatRow =
+ (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ HalfFloatRow(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Apply a lumacolortable to each ARGB pixel.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* luma,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBLumaColorTableRow)(
+ const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma,
+ const uint32 lumacoeff) = ARGBLumaColorTableRow_C;
+ if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+ ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBCopyAlphaRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBCopyAlphaRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Extract just the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8* src_argb,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride,
+ int width,
+ int height) {
+ if (!src_argb || !dst_a || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb += (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ // Coalesce rows.
+ if (src_stride == width * 4 && dst_stride == width) {
+ width *= height;
+ height = 1;
+ src_stride = dst_stride = 0;
+ }
+ void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a, int width) =
+ ARGBExtractAlphaRow_C;
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
+ : ARGBExtractAlphaRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+ : ARGBExtractAlphaRow_Any_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
+ : ARGBExtractAlphaRow_Any_NEON;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBExtractAlphaRow(src_argb, dst_a, width);
+ src_argb += src_stride;
+ dst_a += dst_stride;
+ }
+ return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ ARGBCopyYToAlphaRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// TODO(fbarchard): Consider if width is even Y channel can be split
+// directly. A SplitUVRow_Odd function could copy the remaining chroma.
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2;
+ // row of y and 2 rows of uv
+ align_buffer_64(rows, awidth * 3);
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+ memcpy(dst_y, rows, width);
+ SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+ memcpy(dst_y + dst_stride_y, rows, width);
+ InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, rows, dst_uv, awidth);
+ memcpy(dst_y, rows, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2;
+ // row of y and 2 rows of uv
+ align_buffer_64(rows, awidth * 3);
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+ memcpy(dst_y, rows, width);
+ SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+ memcpy(dst_y + dst_stride_y, rows, width);
+ InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, dst_uv, rows, awidth);
+ memcpy(dst_y, rows, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate.cc b/libyuv/src/main/cpp/libyuv/source/rotate.cc
new file mode 100644
index 0000000..4330884
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate.cc
@@ -0,0 +1,544 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+void TransposePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i = height;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst,
+ int dst_stride, int width) = TransposeWx16_C;
+#else
+ void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst,
+ int dst_stride, int width) = TransposeWx8_C;
+#endif
+#if defined(HAS_TRANSPOSEWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx8 = TransposeWx8_Fast_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
+ TransposeWx8 = TransposeWx8_Fast_DSPR2;
+ } else {
+ TransposeWx8 = TransposeWx8_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ // Work across the source in 16x16 tiles
+ while (i >= 16) {
+ TransposeWx16(src, src_stride, dst, dst_stride, width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work across the source in 8x8 tiles
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) {
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
+LIBYUV_API
+void RotatePlane90(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane270(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
+ MirrorRow = MirrorRow_DSPR2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ MirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ dst += dst_stride;
+ CopyRow(row, dst_bot, width); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+void TransposeUV(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i = height;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a,
+ int dst_stride_a, uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#else
+ void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a,
+ int dst_stride_a, uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_DSPR2;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ // Work through the source in 8x8 tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 8 columns.
+ dst_b += 16; // Move over 8 columns.
+ i -= 16;
+ }
+#else
+ // Work through the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) {
+ TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, i);
+ }
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+
+ TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+ height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ dst_a += dst_stride_a * (width - 1);
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a;
+ dst_stride_b = -dst_stride_b;
+
+ TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+ height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
+void RotateUV180(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
+ MirrorUVRow = MirrorUVRow_DSPR2;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1);
+ dst_b += dst_stride_b * (height - 1);
+
+ for (i = 0; i < height; ++i) {
+ MirrorUVRow(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+LIBYUV_API
+int RotatePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+ !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_any.cc b/libyuv/src/main/cpp/libyuv/source/rotate_any.cc
new file mode 100644
index 0000000..562096b
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_any.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, MASK) \
+ void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \
+ int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
+ } \
+ TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+ }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_DSPR2
+TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
+#endif
+#undef TANY
+
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
+ void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \
+ int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+ } \
+ TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \
+ dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+ }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_DSPR2
+TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
+#endif
+#undef TUVANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_argb.cc b/libyuv/src/main/cpp/libyuv/source/rotate_argb.cc
new file mode 100644
index 0000000..ede4eaf
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_argb.cc
@@ -0,0 +1,240 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) && !defined(__native_client__)) || \
+ defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr,
+ int src_stride,
+ int src_stepx,
+ uint8* dst_ptr,
+ int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr,
+ int src_stride,
+ int src_stepx,
+ uint8* dst_ptr,
+ int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr,
+ int,
+ int src_stepx,
+ uint8* dst_ptr,
+ int dst_width);
+
+static void ARGBTranspose(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i;
+ int src_pixel_step = src_stride >> 2;
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) =
+ ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ }
+#endif
+
+ for (i = 0; i < width; ++i) { // column of source to row of dest.
+ ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+ dst += dst_stride;
+ src += 4;
+ }
+}
+
+void ARGBRotate90(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a ARGBTranspose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a ARGBTranspose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width * 4);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
+ case kRotate90:
+ ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
+ return 0;
+ case kRotate270:
+ ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
+ return 0;
+ case kRotate180:
+ ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_common.cc b/libyuv/src/main/cpp/libyuv/source/rotate_common.cc
new file mode 100644
index 0000000..89357e7
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_common.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeUVWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+void TransposeWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+void TransposeUVWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width * 2; i += 2) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+ dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+ }
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_dspr2.cc b/libyuv/src/main/cpp/libyuv/source/rotate_dspr2.cc
new file mode 100644
index 0000000..5d2338d
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_dspr2.cc
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+ // dst + dst_stride word aligned
+ "1: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "sw $s0, 0(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "sw $s1, 4(%[dst]) \n"
+ "bnez %[width], 1b \n"
+ " addu %[dst], %[dst], %[dst_stride] \n"
+ "b 2f \n"
+ // dst + dst_stride unaligned
+ "11: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "swr $s0, 0(%[dst]) \n"
+ "swl $s0, 3(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "swr $s1, 4(%[dst]) \n"
+ "swl $s1, 7(%[dst]) \n"
+ "bnez %[width], 11b \n"
+ "addu %[dst], %[dst], %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
+ : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
+}
+
+void TransposeWx8_Fast_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ __asm__ __volatile__(
+ ".set noat \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+
+ "srl $AT, %[width], 0x2 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+ // dst + dst_stride word aligned
+ "1: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+ // t0 = | 30 | 20 | 10 | 00 |
+ // t1 = | 31 | 21 | 11 | 01 |
+ // t8 = | 32 | 22 | 12 | 02 |
+ // t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+ // t0 = | 34 | 24 | 14 | 04 |
+ // t1 = | 35 | 25 | 15 | 05 |
+ // t8 = | 36 | 26 | 16 | 06 |
+ // t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "sw $s4, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $s6, 0($s0) \n"
+ "sw $t8, 4($s0) \n"
+ "sw $s5, 0($s1) \n"
+ "sw $t1, 4($s1) \n"
+ "sw $s7, 0($s2) \n"
+ "sw $t9, 4($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 1b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "b 2f \n"
+ // dst + dst_stride unaligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+ // t0 = | 30 | 20 | 10 | 00 |
+ // t1 = | 31 | 21 | 11 | 01 |
+ // t8 = | 32 | 22 | 12 | 02 |
+ // t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+ // t0 = | 34 | 24 | 14 | 04 |
+ // t1 = | 35 | 25 | 15 | 05 |
+ // t8 = | 36 | 26 | 16 | 06 |
+ // t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "swr $s4, 0(%[dst]) \n"
+ "swl $s4, 3(%[dst]) \n"
+ "swr $t0, 4(%[dst]) \n"
+ "swl $t0, 7(%[dst]) \n"
+ "swr $s6, 0($s0) \n"
+ "swl $s6, 3($s0) \n"
+ "swr $t8, 4($s0) \n"
+ "swl $t8, 7($s0) \n"
+ "swr $s5, 0($s1) \n"
+ "swl $s5, 3($s1) \n"
+ "swr $t1, 4($s1) \n"
+ "swl $t1, 7($s1) \n"
+ "swr $s7, 0($s2) \n"
+ "swl $s7, 3($s2) \n"
+ "swr $t9, 4($s2) \n"
+ "swl $t9, 7($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 11b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ ".set at \n"
+ : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
+ : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
+ "s2", "s3", "s4", "s5", "s6", "s7");
+}
+
+void TransposeUVWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "subu $t7, $t9, %[src_stride] \n"
+ "srl $t1, %[width], 1 \n"
+
+ // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ "andi $t0, %[dst_a], 0x3 \n"
+ "andi $t8, %[dst_b], 0x3 \n"
+ "or $t0, $t0, $t8 \n"
+ "andi $t8, %[dst_stride_a], 0x3 \n"
+ "andi $s5, %[dst_stride_b], 0x3 \n"
+ "or $t8, $t8, $s5 \n"
+ "or $t0, $t0, $t8 \n"
+ "bnez $t0, 11f \n"
+ " nop \n"
+ // dst + dst_stride word aligned (both, a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "sw $s3, 0($s5) \n"
+ "sw $s4, 0($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "sw $s3, 0(%[dst_a]) \n"
+ "sw $s4, 0(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "sw $s3, 4($s5) \n"
+ "sw $s4, 4($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "sw $s3, 4(%[dst_a]) \n"
+ "sw $s4, 4(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 1b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+ "b 2f \n"
+ " nop \n"
+
+ // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "swr $s3, 0($s5) \n"
+ "swl $s3, 3($s5) \n"
+ "swr $s4, 0($s6) \n"
+ "swl $s4, 3($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "swr $s3, 0(%[dst_a]) \n"
+ "swl $s3, 3(%[dst_a]) \n"
+ "swr $s4, 0(%[dst_b]) \n"
+ "swl $s4, 3(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+
+ "swr $s3, 4($s5) \n"
+ "swl $s3, 7($s5) \n"
+ "swr $s4, 4($s6) \n"
+ "swl $s4, 7($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "swr $s3, 4(%[dst_a]) \n"
+ "swl $s3, 7(%[dst_a]) \n"
+ "swr $s4, 4(%[dst_b]) \n"
+ "swl $s4, 7(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 11b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
+ [width] "+r"(width), [src_stride] "+r"(src_stride)
+ : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
+ "s2", "s3", "s4", "s5", "s6");
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_gcc.cc b/libyuv/src/main/cpp/libyuv/source/rotate_gcc.cc
new file mode 100644
index 0000000..74b48ac
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_gcc.cc
@@ -0,0 +1,374 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm15");
+}
+#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8. 64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9");
+}
+#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_msa.cc b/libyuv/src/main/cpp/libyuv/source/rotate_msa.cc
new file mode 100644
index 0000000..8907765
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_msa.cc
@@ -0,0 +1,250 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \
+ out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \
+ out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \
+ out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \
+ }
+
+#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \
+ out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \
+ out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \
+ out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \
+ }
+
+#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \
+ out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \
+ out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \
+ out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \
+ }
+
+#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \
+ out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
+ }
+
+void TransposeWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+ TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+ width);
+}
+
+void TransposeUVWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+ dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
+
+void TransposeWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ int x;
+ const uint8* s;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < width; x += 16) {
+ s = src;
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+ ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ dst += dst_stride * 4;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+ ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ dst += dst_stride * 4;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+ ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ dst += dst_stride * 4;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+ ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ src += 16;
+ dst += dst_stride * 4;
+ }
+}
+
+void TransposeUVWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ int x;
+ const uint8* s;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < width; x += 8) {
+ s = src;
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+ ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+ ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+ ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+ ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ src += 16;
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_neon.cc b/libyuv/src/main/cpp/libyuv/source/rotate_neon.cc
new file mode 100644
index 0000000..d9bbc78
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_neon.cc
@@ -0,0 +1,416 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+
+void TransposeWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ const uint8* src_temp;
+ asm volatile(
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %5, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %5, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ "cmp %5, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ "vld1.32 {d0[0]}, [%0], %2 \n"
+ "vld1.32 {d0[1]}, [%0], %2 \n"
+ "vld1.32 {d1[0]}, [%0], %2 \n"
+ "vld1.32 {d1[1]}, [%0], %2 \n"
+ "vld1.32 {d2[0]}, [%0], %2 \n"
+ "vld1.32 {d2[1]}, [%0], %2 \n"
+ "vld1.32 {d3[0]}, [%0], %2 \n"
+ "vld1.32 {d3[1]}, [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "vld1.8 {q3}, [%6] \n"
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "vst1.32 {d4[0]}, [%0], %4 \n"
+ "vst1.32 {d4[1]}, [%0], %4 \n"
+ "vst1.32 {d5[0]}, [%0], %4 \n"
+ "vst1.32 {d5[1]}, [%0] \n"
+
+ "add %0, %3, #4 \n"
+ "vst1.32 {d0[0]}, [%0], %4 \n"
+ "vst1.32 {d0[1]}, [%0], %4 \n"
+ "vst1.32 {d1[0]}, [%0], %4 \n"
+ "vst1.32 {d1[1]}, [%0] \n"
+
+ "add %1, #4 \n" // src += 4
+ "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %5, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "vld1.16 {d0[0]}, [%0], %2 \n"
+ "vld1.16 {d1[0]}, [%0], %2 \n"
+ "vld1.16 {d0[1]}, [%0], %2 \n"
+ "vld1.16 {d1[1]}, [%0], %2 \n"
+ "vld1.16 {d0[2]}, [%0], %2 \n"
+ "vld1.16 {d1[2]}, [%0], %2 \n"
+ "vld1.16 {d0[3]}, [%0], %2 \n"
+ "vld1.16 {d1[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.64 {d0}, [%0], %4 \n"
+ "vst1.64 {d1}, [%0] \n"
+
+ "add %1, #2 \n" // src += 2
+ "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %5, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld1.8 {d0[0]}, [%1], %2 \n"
+ "vld1.8 {d0[1]}, [%1], %2 \n"
+ "vld1.8 {d0[2]}, [%1], %2 \n"
+ "vld1.8 {d0[3]}, [%1], %2 \n"
+ "vld1.8 {d0[4]}, [%1], %2 \n"
+ "vld1.8 {d0[5]}, [%1], %2 \n"
+ "vld1.8 {d0[6]}, [%1], %2 \n"
+ "vld1.8 {d0[7]}, [%1] \n"
+
+ "vst1.64 {d0}, [%3] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst), // %3
+ "+r"(dst_stride), // %4
+ "+r"(width) // %5
+ : "r"(&kVTbl4x4Transpose) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3");
+}
+
+static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
+ 4, 12, 5, 13, 6, 14, 7, 15};
+
+void TransposeUVWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ const uint8* src_temp;
+ asm volatile(
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %7, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %7, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ "cmp %7, #4 \n"
+ "blt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ "vld1.64 {d0}, [%0], %2 \n"
+ "vld1.64 {d1}, [%0], %2 \n"
+ "vld1.64 {d2}, [%0], %2 \n"
+ "vld1.64 {d3}, [%0], %2 \n"
+ "vld1.64 {d4}, [%0], %2 \n"
+ "vld1.64 {d5}, [%0], %2 \n"
+ "vld1.64 {d6}, [%0], %2 \n"
+ "vld1.64 {d7}, [%0] \n"
+
+ "vld1.8 {q15}, [%8] \n"
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.32 {d16[0]}, [%0], %4 \n"
+ "vst1.32 {d16[1]}, [%0], %4 \n"
+ "vst1.32 {d17[0]}, [%0], %4 \n"
+ "vst1.32 {d17[1]}, [%0], %4 \n"
+
+ "add %0, %3, #4 \n"
+ "vst1.32 {d20[0]}, [%0], %4 \n"
+ "vst1.32 {d20[1]}, [%0], %4 \n"
+ "vst1.32 {d21[0]}, [%0], %4 \n"
+ "vst1.32 {d21[1]}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.32 {d18[0]}, [%0], %6 \n"
+ "vst1.32 {d18[1]}, [%0], %6 \n"
+ "vst1.32 {d19[0]}, [%0], %6 \n"
+ "vst1.32 {d19[1]}, [%0], %6 \n"
+
+ "add %0, %5, #4 \n"
+ "vst1.32 {d22[0]}, [%0], %6 \n"
+ "vst1.32 {d22[1]}, [%0], %6 \n"
+ "vst1.32 {d23[0]}, [%0], %6 \n"
+ "vst1.32 {d23[1]}, [%0] \n"
+
+ "add %1, #4*2 \n" // src += 4 * 2
+ "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
+ // dst_stride_b
+ "subs %7, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ "vld2.16 {d1[3], d3[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.64 {d0}, [%0], %4 \n"
+ "vst1.64 {d2}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.64 {d1}, [%0], %6 \n"
+ "vst1.64 {d3}, [%0] \n"
+
+ "add %1, #2*2 \n" // src += 2 * 2
+ "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
+ // dst_stride_b
+ "subs %7, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ "vld2.8 {d0[7], d1[7]}, [%1] \n"
+
+ "vst1.64 {d0}, [%3] \n"
+ "vst1.64 {d1}, [%5] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_a), // %3
+ "+r"(dst_stride_a), // %4
+ "+r"(dst_b), // %5
+ "+r"(dst_stride_b), // %6
+ "+r"(width) // %7
+ : "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_neon64.cc b/libyuv/src/main/cpp/libyuv/source/rotate_neon64.cc
new file mode 100644
index 0000000..137336a
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,424 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+
+void TransposeWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ const uint8* src_temp;
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %w3, %w3, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
+
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 8
+ "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %w3, %w3, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
+
+ "mov %0, %2 \n"
+
+ "ld1 {v2.16b}, [%4] \n"
+
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "st1 {v3.s}[0], [%0], %6 \n"
+ "st1 {v3.s}[1], [%0], %6 \n"
+ "st1 {v3.s}[2], [%0], %6 \n"
+ "st1 {v3.s}[3], [%0] \n"
+
+ "add %0, %2, #4 \n"
+ "st1 {v0.s}[0], [%0], %6 \n"
+ "st1 {v0.s}[1], [%0], %6 \n"
+ "st1 {v0.s}[2], [%0], %6 \n"
+ "st1 {v0.s}[3], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %w3, %w3, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ "ld1 {v1.h}[3], [%0] \n"
+
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v3.8b}, [%0], %6 \n"
+ "st1 {v2.8b}, [%0] \n"
+
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %w3, %w3, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ "ld1 {v0.b}[7], [%1] \n"
+
+ "st1 {v0.8b}, [%2] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast(src_stride)), // %5
+ "r"(static_cast(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] = {
+ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ const uint8* src_temp;
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %w4, %w4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
+
+ "ld1 {v30.16b}, [%8], #16 \n"
+ "ld1 {v31.16b}, [%8] \n"
+
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v16.s}[0], [%0], %6 \n"
+ "st1 {v16.s}[1], [%0], %6 \n"
+ "st1 {v16.s}[2], [%0], %6 \n"
+ "st1 {v16.s}[3], [%0], %6 \n"
+
+ "add %0, %2, #4 \n"
+ "st1 {v18.s}[0], [%0], %6 \n"
+ "st1 {v18.s}[1], [%0], %6 \n"
+ "st1 {v18.s}[2], [%0], %6 \n"
+ "st1 {v18.s}[3], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v17.s}[0], [%0], %7 \n"
+ "st1 {v17.s}[1], [%0], %7 \n"
+ "st1 {v17.s}[2], [%0], %7 \n"
+ "st1 {v17.s}[3], [%0], %7 \n"
+
+ "add %0, %3, #4 \n"
+ "st1 {v19.s}[0], [%0], %7 \n"
+ "st1 {v19.s}[1], [%0], %7 \n"
+ "st1 {v19.s}[2], [%0], %7 \n"
+ "st1 {v19.s}[3], [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %w4, %w4, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
+
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v4.d}[0], [%0], %6 \n"
+ "st1 {v6.d}[0], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v5.d}[0], [%0], %7 \n"
+ "st1 {v7.d}[0], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %w4, %w4, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
+
+ "st1 {v0.d}[0], [%2] \n"
+ "st1 {v1.d}[0], [%3] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"(static_cast(src_stride)), // %5
+ "r"(static_cast(dst_stride_a)), // %6
+ "r"(static_cast(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v30", "v31"
+ );
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/rotate_win.cc b/libyuv/src/main/cpp/libyuv/source/rotate_win.cc
new file mode 100644
index 0000000..93a5c28
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/rotate_win.cc
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ __asm {
+ push edi
+ push esi
+ push ebp
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst
+ mov esi, [esp + 12 + 16] // dst_stride
+ mov ecx, [esp + 12 + 20] // width
+
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea ebp, [eax + 8]
+ movq xmm1, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm0, xmm1
+ movq xmm2, qword ptr [eax]
+ movdqa xmm1, xmm0
+ palignr xmm1, xmm1, 8
+ movq xmm3, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ movq xmm4, qword ptr [eax]
+ palignr xmm3, xmm3, 8
+ movq xmm5, qword ptr [eax + edi]
+ punpcklbw xmm4, xmm5
+ lea eax, [eax + 2 * edi]
+ movdqa xmm5, xmm4
+ movq xmm6, qword ptr [eax]
+ palignr xmm5, xmm5, 8
+ movq xmm7, qword ptr [eax + edi]
+ punpcklbw xmm6, xmm7
+ mov eax, ebp
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm7, 8
+ // Second round of bit swap.
+ punpcklwd xmm0, xmm2
+ punpcklwd xmm1, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ palignr xmm2, xmm2, 8
+ palignr xmm3, xmm3, 8
+ punpcklwd xmm4, xmm6
+ punpcklwd xmm5, xmm7
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ palignr xmm6, xmm6, 8
+ palignr xmm7, xmm7, 8
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ punpckldq xmm0, xmm4
+ movq qword ptr [edx], xmm0
+ movdqa xmm4, xmm0
+ palignr xmm4, xmm4, 8
+ movq qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ punpckldq xmm2, xmm6
+ movdqa xmm6, xmm2
+ palignr xmm6, xmm6, 8
+ movq qword ptr [edx], xmm2
+ punpckldq xmm1, xmm5
+ movq qword ptr [edx + esi], xmm6
+ lea edx, [edx + 2 * esi]
+ movdqa xmm5, xmm1
+ movq qword ptr [edx], xmm1
+ palignr xmm5, xmm5, 8
+ punpckldq xmm3, xmm7
+ movq qword ptr [edx + esi], xmm5
+ lea edx, [edx + 2 * esi]
+ movq qword ptr [edx], xmm3
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm7, 8
+ sub ecx, 8
+ movq qword ptr [edx + esi], xmm7
+ lea edx, [edx + 2 * esi]
+ jg convertloop
+
+ pop ebp
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int w) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
+ mov edx, [esp + 16 + 12] // dst_a
+ mov esi, [esp + 16 + 16] // dst_stride_a
+ mov ebx, [esp + 16 + 20] // dst_b
+ mov ebp, [esp + 16 + 24] // dst_stride_b
+ mov ecx, esp
+ sub esp, 4 + 16
+ and esp, ~15
+ mov [esp + 16], ecx
+ mov ecx, [ecx + 16 + 28] // w
+
+ align 4
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm0 // use xmm7 as temp register.
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm7, xmm1
+ movdqa xmm1, xmm7
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm2
+ punpcklbw xmm2, xmm3
+ punpckhbw xmm7, xmm3
+ movdqa xmm3, xmm7
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm4
+ punpcklbw xmm4, xmm5
+ punpckhbw xmm7, xmm5
+ movdqa xmm5, xmm7
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqu [esp], xmm5 // backup xmm5
+ neg edi
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
+ punpcklbw xmm6, xmm7
+ punpckhbw xmm5, xmm7
+ movdqa xmm7, xmm5
+ lea eax, [eax + 8 * edi + 16]
+ neg edi
+ // Second round of bit swap.
+ movdqa xmm5, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm5, xmm2
+ movdqa xmm2, xmm5
+ movdqa xmm5, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm5, xmm3
+ movdqa xmm3, xmm5
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm5, xmm6
+ movdqa xmm6, xmm5
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
+ punpcklwd xmm5, xmm7
+ punpckhwd xmm6, xmm7
+ movdqa xmm7, xmm6
+
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ movdqa xmm6, xmm0
+ punpckldq xmm0, xmm4
+ punpckhdq xmm6, xmm4
+ movdqa xmm4, xmm6
+ movdqu xmm6, [esp] // restore xmm6
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [ebx], xmm0
+ movlpd qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm4
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ punpckldq xmm2, xmm6
+ movlpd qword ptr [edx], xmm2
+ movhpd qword ptr [ebx], xmm2
+ punpckhdq xmm0, xmm6
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16]
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_any.cc b/libyuv/src/main/cpp/libyuv/source/row_any.cc
new file mode 100644
index 0000000..1092a9c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_any.cc
@@ -0,0 +1,1070 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// memset for temp is meant to clear the source buffer (not dest) so that
+// SIMD that reads full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// by the source type (e.g. ARGB) and the mask (last parameter), or by examining
+// the source code for how much the source pointers are advanced.
+
+// Subsampled source needs to be increase by 1 of not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 4 planes to 1 with yuvconstants
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ const uint8* a_buf, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 5]); \
+ memset(temp, 0, 64 * 4); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 192, a_buf + n, r); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
+ yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+#ifdef HAS_I422TOYUY2ROW_SSE2
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
+ temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+#endif // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_DSPR2
+ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
+#undef ANY31C
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
+
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_DSPR2
+ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
+#undef ANY21C
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_RGB24TOARGBROW_DSPR2
+ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_DSPR2
+ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_DSPR2
+ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_DSPR2
+ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_DSPR2
+ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_BGRATOYROW_DSPR2
+ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_DSPR2
+ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_DSPR2
+ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_DSPR2
+ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#undef ANY11
+
+// Any 1 to 1 blended. Destination is read, modify, write.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T param, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, param, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+ ARGBToRGB565DitherRow_SSE2,
+ const uint32,
+ 4,
+ 2,
+ 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+ ARGBToRGB565DitherRow_AVX2,
+ const uint32,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+ ARGBToRGB565DitherRow_NEON,
+ const uint32,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+ ARGBToRGB565DitherRow_MSA,
+ const uint32,
+ 4,
+ 2,
+ 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
+ SIMD_ALIGNED(uint16 temp[16 * 2]); \
+ memset(temp, 0, 32); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(temp, src_ptr + n, r * SBPP); \
+ ANY_SIMD(temp, temp + 16, param, MASK + 1); \
+ memcpy(dst_ptr + n, temp + 16, r * BPP); \
+ }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
+#endif
+#undef ANY11P16
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \
+ int width, int source_y_fraction) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_DSPR2
+ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#endif
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
+#endif
+#undef ANY11T
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr, r* BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+ }
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8 temp[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(temp, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp, r * BPP); \
+ }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2. Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 3]); \
+ memset(temp, 0, 128); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
+ }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_DSPR2
+ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u, \
+ uint8* dst_v, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 4]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
+ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_DSPR2
+ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_DSPR2
+ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_DSPR2
+ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_DSPR2
+ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
+#undef ANY12S
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_common.cc b/libyuv/src/main/cpp/libyuv/source/row_common.cc
new file mode 100644
index 0000000..bf953ee
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_common.cc
@@ -0,0 +1,2645 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return ((-(v) >> 31) & (v));
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ int m = v >> 31;
+ return (v + m) ^ m;
+}
+#else // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ return (v < 0) ? -v : v;
+}
+#endif // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+ p[0] = (uint8)(v & 255);
+ p[1] = (uint8)((v >> 8) & 255);
+ p[2] = (uint8)((v >> 16) & 255);
+ p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb24 += 3;
+ }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ dst_rgb24 += 3;
+ src_raw += 3;
+ }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 2) | (g >> 4);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb565 += 2;
+ }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 a = src_argb1555[1] >> 7;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 3) | (g >> 2);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = -a;
+ dst_argb += 4;
+ src_argb1555 += 2;
+ }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ uint8 a = src_argb4444[1] >> 4;
+ dst_argb[0] = (b << 4) | b;
+ dst_argb[1] = (g << 4) | g;
+ dst_argb[2] = (r << 4) | r;
+ dst_argb[3] = (a << 4) | a;
+ dst_argb += 4;
+ src_argb4444 += 2;
+ }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = b;
+ dst_rgb[1] = g;
+ dst_rgb[2] = r;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = r;
+ dst_rgb[1] = g;
+ dst_rgb[2] = b;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 2;
+ uint8 r1 = src_argb[6] >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+ (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+// dither4 is a row of 4 values from 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The order in the 4x4 matrix in first byte is upper left.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect order of the original matrix. But the dither4
+// will containing the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+ int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+ uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+ uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+ uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+ uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+ (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+ uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 3;
+ uint8 r1 = src_argb[6] >> 3;
+ uint8 a1 = src_argb[7] >> 7;
+ *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ uint8 b1 = src_argb[4] >> 4;
+ uint8 g1 = src_argb[5] >> 4;
+ uint8 r1 = src_argb[6] >> 4;
+ uint8 a1 = src_argb[7] >> 4;
+ *(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) |
+ (g1 << 20) | (r1 << 24) | (a1 << 28);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+ return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+ return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+ return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
+}
+
+// ARGBToY_C and ARGBToUV_C
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP]) >> \
+ 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP]) >> \
+ 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP]) >> \
+ 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+ }
+
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+ return (38 * r + 75 * g + 15 * b + 64) >> 7;
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+// ARGBToYJ_C and ARGBToUVJ_C
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+ }
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ b = (b << 3) | (b >> 2);
+ g = (g << 2) | (g >> 4);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_rgb565 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ b = (b << 3) | (b >> 2);
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb1555 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ b = (b << 4) | b;
+ g = (g << 4) | g;
+ r = (r << 4) | r;
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb4444 += 2;
+ dst_y += 1;
+ }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b1 = src_rgb565[2] & 0x1f;
+ uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+ uint8 r1 = src_rgb565[3] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b3 = next_rgb565[2] & 0x1f;
+ uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+ uint8 r3 = next_rgb565[3] >> 3;
+ uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 787 -> 888.
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_rgb565 += 4;
+ next_rgb565 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b = (b0 + b2); // 565 * 2 = 676.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 676 -> 888
+ g = (g << 1) | (g >> 6);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b1 = src_argb1555[2] & 0x1f;
+ uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+ uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+ uint8 b3 = next_argb1555[2] & 0x1f;
+ uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+ uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+ uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 777 -> 888.
+ g = (g << 1) | (g >> 6);
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb1555 += 4;
+ next_argb1555 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = next_argb1555[1] >> 3;
+ uint8 b = (b0 + b2); // 555 * 2 = 666.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b1 = src_argb4444[2] & 0x0f;
+ uint8 g1 = src_argb4444[2] >> 4;
+ uint8 r1 = src_argb4444[3] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b3 = next_argb4444[2] & 0x0f;
+ uint8 g3 = next_argb4444[2] >> 4;
+ uint8 r3 = next_argb4444[3] & 0x0f;
+ uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb4444 += 4;
+ next_argb4444 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b = (b0 + b2); // 444 * 2 = 555.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 3) | (b >> 2); // 555 -> 888.
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = src_argb[3];
+ dst_argb += 4;
+ src_argb += 4;
+ }
+}
+
+// Convert a row of image to Sepia tone.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int sb = (b * 17 + g * 68 + r * 35) >> 7;
+ int sg = (b * 22 + g * 88 + r * 45) >> 7;
+ int sr = (b * 24 + g * 98 + r * 50) >> 7;
+ // b does not over flow. a is preserved from original.
+ dst_argb[0] = sb;
+ dst_argb[1] = clamp255(sg);
+ dst_argb[2] = clamp255(sr);
+ dst_argb += 4;
+ }
+}
+
+// Apply color matrix to a row of image. Matrix is signed.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = src_argb[0];
+ int g = src_argb[1];
+ int r = src_argb[2];
+ int a = src_argb[3];
+ int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
+ a * matrix_argb[3]) >>
+ 6;
+ int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
+ a * matrix_argb[7]) >>
+ 6;
+ int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
+ a * matrix_argb[11]) >>
+ 6;
+ int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
+ a * matrix_argb[15]) >>
+ 6;
+ dst_argb[0] = Clamp(sb);
+ dst_argb[1] = Clamp(sg);
+ dst_argb[2] = Clamp(sr);
+ dst_argb[3] = Clamp(sa);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb[3] = table_argb[a * 4 + 3];
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb += 4;
+ }
+}
+
+void ARGBQuantizeRow_C(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+ dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
+ dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb += 4;
+ }
+}
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v* f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ const uint32 b_scale = REPEAT8(value & 0xff);
+ const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32 a_scale = REPEAT8(value >> 24);
+
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb[0]);
+ const uint32 g = REPEAT8(src_argb[1]);
+ const uint32 r = REPEAT8(src_argb[2]);
+ const uint32 a = REPEAT8(src_argb[3]);
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v* f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb0[0]);
+ const uint32 g = REPEAT8(src_argb0[1]);
+ const uint32 r = REPEAT8(src_argb0[2]);
+ const uint32 a = REPEAT8(src_argb0[3]);
+ const uint32 b_scale = src_argb1[0];
+ const uint32 g_scale = src_argb1[1];
+ const uint32 r_scale = src_argb1[2];
+ const uint32 a_scale = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_add = src_argb1[0];
+ const int g_add = src_argb1[1];
+ const int r_add = src_argb1[2];
+ const int a_add = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_add);
+ dst_argb[1] = SHADE(g, g_add);
+ dst_argb[2] = SHADE(r, r_add);
+ dst_argb[3] = SHADE(a, a_add);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_sub = src_argb1[0];
+ const int g_sub = src_argb1[1];
+ const int r_sub = src_argb1[2];
+ const int a_sub = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_sub);
+ dst_argb[1] = SHADE(g, g_sub);
+ dst_argb[2] = SHADE(r, r_sub);
+ dst_argb[3] = SHADE(a, a_sub);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+// Sobel functions which mimics SSSE3.
+void SobelXRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int a = src_y0[i];
+ int b = src_y1[i];
+ int c = src_y2[i];
+ int a_sub = src_y0[i + 2];
+ int b_sub = src_y1[i + 2];
+ int c_sub = src_y2[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobelx[i] = (uint8)(clamp255(sobel));
+ }
+}
+
+void SobelYRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int a = src_y0[i + 0];
+ int b = src_y0[i + 1];
+ int c = src_y0[i + 2];
+ int a_sub = src_y1[i + 0];
+ int b_sub = src_y1[i + 1];
+ int c_sub = src_y1[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobely[i] = (uint8)(clamp255(sobel));
+ }
+}
+
+void SobelRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_argb[0] = (uint8)(s);
+ dst_argb[1] = (uint8)(s);
+ dst_argb[2] = (uint8)(s);
+ dst_argb[3] = (uint8)(255u);
+ dst_argb += 4;
+ }
+}
+
+void SobelToPlaneRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_y[i] = (uint8)(s);
+ }
+}
+
+void SobelXYRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int g = clamp255(r + b);
+ dst_argb[0] = (uint8)(b);
+ dst_argb[1] = (uint8)(g);
+ dst_argb[2] = (uint8)(r);
+ dst_argb[3] = (uint8)(255u);
+ dst_argb += 4;
+ }
+}
+
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+ // Copy a Y to RGB.
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 y = src_y[0];
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ ++src_y;
+ }
+}
+
+// TODO(fbarchard): Unify these structures to be platform independent.
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__) // 64 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#elif defined(__arm__) // 32 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -113 /* round(-1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.709 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR -115 /* round(-1.793 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// C reference code that mimics the YUV assembly.
+static __inline void YuvPixel(uint8 y,
+ uint8 u,
+ uint8 v,
+ uint8* b,
+ uint8* g,
+ uint8* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = -yuvconstants->kUVToRB[1];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[4];
+ int vr = -yuvconstants->kUVToRB[4];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+ int ub = yuvconstants->kUVToB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = yuvconstants->kUVToR[1];
+ int bb = yuvconstants->kUVBiasB[0];
+ int bg = yuvconstants->kUVBiasG[0];
+ int br = yuvconstants->kUVBiasR[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+
+ uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
+ *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
+ *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
+ *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
+}
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(y1 + YGB) >> 6);
+ *g = Clamp((int32)(y1 + YGB) >> 6);
+ *r = Clamp((int32)(y1 + YGB) >> 6);
+}
+
+#undef YG
+#undef YGB
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// C mimic assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+ uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
+ yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
+ yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 2;
+ src_v += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+#else
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+#endif
+
+// Also used for 420
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422AlphaToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = src_a[1];
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ src_a += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];
+ }
+}
+
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ }
+}
+
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ b1 = b1 >> 4;
+ g1 = g1 >> 4;
+ r1 = r1 >> 4;
+ *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+ (g1 << 20) | (r1 << 24) | 0xf000f000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb4444 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
+ }
+}
+
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 3;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+ (g1 << 21) | (r1 << 26) | 0x80008000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb1555 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
+ }
+}
+
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) =
+ b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_uv += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_vu += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) =
+ b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_uv += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_yuy2 += 4;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_uyvy += 4;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+ rgb_buf + 3, yuvconstants);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+ rgb_buf + 7, yuvconstants);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+ rgb_buf + 3, yuvconstants);
+ rgb_buf[0] = 255;
+ }
+}
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+ int x;
+ src += width - 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[-2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[-2 + 1];
+ src_uv -= 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+ int x;
+ const uint32* src32 = (const uint32*)(src);
+ uint32* dst32 = (uint32*)(dst);
+ src32 += width - 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst32[x] = src32[0];
+ dst32[x + 1] = src32[-1];
+ src32 -= 2;
+ }
+ if (width & 1) {
+ dst32[width - 1] = src32[0];
+ }
+}
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[3];
+ src_uv += 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void MergeUVRow_C(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = src_u[x];
+ dst_uv[1] = src_v[x];
+ dst_uv[2] = src_u[x + 1];
+ dst_uv[3] = src_v[x + 1];
+ dst_uv += 4;
+ }
+ if (width & 1) {
+ dst_uv[0] = src_u[width - 1];
+ dst_uv[1] = src_v[width - 1];
+ }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+ memcpy(dst, src, count);
+}
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+ memcpy(dst, src, count * 2);
+}
+
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+ memset(dst, v8, width);
+}
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+ uint32* d = (uint32*)(dst_argb);
+ int x;
+ for (x = 0; x < width; ++x) {
+ d[x] = v32;
+ }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ // Output a row of UV values.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = src_yuy2[1];
+ dst_v[0] = src_yuy2[3];
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_yuy2[0];
+ dst_y[x + 1] = src_yuy2[2];
+ src_yuy2 += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_yuy2[0];
+ }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_C(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ // Output a row of UV values.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+ dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ // Output a row of UV values.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = src_uyvy[0];
+ dst_v[0] = src_uyvy[2];
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_uyvy[1];
+ dst_y[x + 1] = src_uyvy[3];
+ src_uyvy += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_uyvy[1];
+ }
+}
+
+#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+
+ fb = src_argb0[4 + 0];
+ fg = src_argb0[4 + 1];
+ fr = src_argb0[4 + 2];
+ a = src_argb0[4 + 3];
+ bb = src_argb1[4 + 0];
+ bg = src_argb1[4 + 1];
+ br = src_argb1[4 + 2];
+ dst_argb[4 + 0] = BLEND(fb, bb, a);
+ dst_argb[4 + 1] = BLEND(fg, bg, a);
+ dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 3] = 255u;
+ src_argb0 += 8;
+ src_argb1 += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+ }
+}
+#undef BLEND
+
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+ dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
+ src0 += 2;
+ src1 += 2;
+ alpha += 2;
+ dst += 2;
+ }
+ if (width & 1) {
+ dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+ }
+}
+#undef UBLEND
+
+#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ b = src_argb[4];
+ g = src_argb[5];
+ r = src_argb[6];
+ a = src_argb[7];
+ dst_argb[4] = ATTENUATE(b, a);
+ dst_argb[5] = ATTENUATE(g, a);
+ dst_argb[6] = ATTENUATE(r, a);
+ dst_argb[7] = a;
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ const uint32 b = src_argb[0];
+ const uint32 g = src_argb[1];
+ const uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// Reciprocal method is off by 1 on some values. ie 125
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
+const uint32 fixed_invtbl8[256] = {
+ 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06),
+ T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d),
+ T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14),
+ T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b),
+ T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22),
+ T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29),
+ T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30),
+ T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e),
+ T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45),
+ T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c),
+ T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53),
+ T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a),
+ T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61),
+ T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68),
+ T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76),
+ T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d),
+ T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84),
+ T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b),
+ T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92),
+ T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99),
+ T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0),
+ T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae),
+ T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5),
+ T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc),
+ T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3),
+ T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca),
+ T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1),
+ T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8),
+ T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6),
+ T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed),
+ T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4),
+ T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb),
+ T(0xfc), T(0xfd), T(0xfe), 0x01000100};
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
+ b = (b * ia) >> 8;
+ g = (g * ia) >> 8;
+ r = (r * ia) >> 8;
+ // Clamping should not be necessary but is free in assembly.
+ dst_argb[0] = clamp255(b);
+ dst_argb[1] = clamp255(g);
+ dst_argb[2] = clamp255(r);
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void ComputeCumulativeSumRow_C(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width) {
+ int32 row_sum[4] = {0, 0, 0, 0};
+ int x;
+ for (x = 0; x < width; ++x) {
+ row_sum[0] += row[x * 4 + 0];
+ row_sum[1] += row[x * 4 + 1];
+ row_sum[2] += row[x * 4 + 2];
+ row_sum[3] += row[x * 4 + 3];
+ cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+ cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+ cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+ cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
+ }
+}
+
+void CumulativeSumToAverageRow_C(const int32* tl,
+ const int32* bl,
+ int w,
+ int area,
+ uint8* dst,
+ int count) {
+ float ooa = 1.0f / area;
+ int i;
+ for (i = 0; i < count; ++i) {
+ dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst += 4;
+ tl += 4;
+ bl += 4;
+ }
+}
+
+// Copy pixels from rotated source to destination row with a slope.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width) {
+ int i;
+ // Render a row of pixels from source into a buffer.
+ float uv[2];
+ uv[0] = uv_dudv[0];
+ uv[1] = uv_dudv[1];
+ for (i = 0; i < width; ++i) {
+ int x = (int)(uv[0]);
+ int y = (int)(uv[1]);
+ *(uint32*)(dst_argb) =
+ *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
+ dst_argb += 4;
+ uv[0] += uv_dudv[2];
+ uv[1] += uv_dudv[3];
+ }
+}
+
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint8* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
+static void HalfRow_16_C(const uint16* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint16* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ int x;
+ if (y1_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width);
+ return;
+ }
+ if (y1_fraction == 128) {
+ HalfRow_C(src_ptr, src_stride, dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width - 1; x += 2) {
+ dst_ptr[0] =
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ dst_ptr[1] =
+ (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] =
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ }
+}
+
+void InterpolateRow_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16* src_ptr1 = src_ptr + src_stride;
+ int x;
+ if (source_y_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width * 2);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width - 1; x += 2) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ }
+}
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ int index0 = shuffler[0];
+ int index1 = shuffler[1];
+ int index2 = shuffler[2];
+ int index3 = shuffler[3];
+ // Shuffle a row of ARGB.
+ int x;
+ for (x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 b = src_argb[index0];
+ uint8 g = src_argb[index1];
+ uint8 r = src_argb[index2];
+ uint8 a = src_argb[index3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[1];
+ dst_frame[3] = src_v[0];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = 0;
+ dst_frame[3] = src_v[0];
+ }
+}
+
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[1];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = 0;
+ }
+}
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ float b = (float)(src_argb[0]);
+ float g = (float)(src_argb[1]);
+ float r = (float)(src_argb[2]);
+ float a = (float)(src_argb[3]);
+ float b2 = b * b;
+ float g2 = g * g;
+ float r2 = r * r;
+ float a2 = a * a;
+ float db = poly[0] + poly[4] * b;
+ float dg = poly[1] + poly[5] * g;
+ float dr = poly[2] + poly[6] * r;
+ float da = poly[3] + poly[7] * a;
+ float b3 = b2 * b;
+ float g3 = g2 * g;
+ float r3 = r2 * r;
+ float a3 = a2 * a;
+ db += poly[8] * b2;
+ dg += poly[9] * g2;
+ dr += poly[10] * r2;
+ da += poly[11] * a2;
+ db += poly[12] * b3;
+ dg += poly[13] * g3;
+ dr += poly[14] * r3;
+ da += poly[15] * a3;
+
+ dst_argb[0] = Clamp((int32)(db));
+ dst_argb[1] = Clamp((int32)(dg));
+ dst_argb[2] = Clamp((int32)(dr));
+ dst_argb[3] = Clamp((int32)(da));
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor
+// adjust the source integer range to the half float range desired.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
+ int i;
+ float mult = 1.9259299444e-34f * scale;
+ for (i = 0; i < width; ++i) {
+ float value = src[i] * mult;
+ dst[i] = (uint16)((*(uint32_t*)&value) >> 13);
+ }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff) {
+ uint32 bc = lumacoeff & 0xff;
+ uint32 gc = (lumacoeff >> 8) & 0xff;
+ uint32 rc = (lumacoeff >> 16) & 0xff;
+
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 =
+ ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+ luma;
+ const uint8* luma1;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ luma1 =
+ ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
+ luma;
+ dst_argb[4] = luma1[src_argb[4]];
+ dst_argb[5] = luma1[src_argb[5]];
+ dst_argb[6] = luma1[src_argb[6]];
+ dst_argb[7] = src_argb[7];
+ src_argb += 8;
+ dst_argb += 8;
+ }
+ if (width & 1) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 =
+ ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+ luma;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[3];
+ dst[7] = src[7];
+ dst += 8;
+ src += 8;
+ }
+ if (width & 1) {
+ dst[3] = src[3];
+ }
+}
+
+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst_a[0] = src_argb[3];
+ dst_a[1] = src_argb[7];
+ dst_a += 2;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ dst_a[0] = src_argb[3];
+ }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[0];
+ dst[7] = src[1];
+ dst += 8;
+ src += 2;
+ }
+ if (width & 1) {
+ dst[3] = src[0];
+ }
+}
+
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+ defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc has asm version, but GCC uses 2 step wrapper.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+#else
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+#else
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+ // TODO(fbarchard): ARGBToRGB24Row_AVX2
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_dspr2.cc b/libyuv/src/main/cpp/libyuv/source/row_dspr2.cc
new file mode 100644
index 0000000..11f78e0
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_dspr2.cc
@@ -0,0 +1,1721 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__(
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at ,$zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+ // word-aligned now count is the remining bytes count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be
+ // past
+ // the "t0-32" address
+ // This means: for x=128 the last "safe" a1 address is "t0-160"
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+ // we will use "pref 30,128(a1)", so "t0-160" is the limit
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sltu $v1, $t9, %[dst] \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sltu $v1, $t9, %[dst] \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src
+ "move %[count], $t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // the t8 is the reminder count past 32-bytes
+ "beq %[count], $t8, chk1w \n"
+ // count=t8,no 32-byte chunk
+ " nop \n"
+
+ "lw $t0, 0(%[src]) \n"
+ "lw $t1, 4(%[src]) \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the reminder past 1w chunks
+ "beq %[count], $t8, $last8 \n"
+ " subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+ // copying in words (4-byte chunks)
+ "$wordCopy_loop: \n"
+ "lw $t3, 0(%[src]) \n"
+ // the first t3 may be equal t0 ... optimize?
+ "addiu %[src], %[src],4 \n"
+ "addiu %[dst], %[dst],4 \n"
+ "bne %[dst], $a3,$wordCopy_loop \n"
+ " sw $t3, -4(%[dst]) \n"
+
+ // For the last (<8) bytes
+ "$last8: \n"
+ "blez %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 -last dst address
+ "$last8loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst], $a3, $last8loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "leave: \n"
+ " j $ra \n"
+ " nop \n"
+
+ //
+ // UNALIGNED case
+ //
+
+ "unaligned: \n"
+ // got here with a3="negu a1"
+ "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+ "beqz $a3, $ua_chk16w \n"
+ " subu %[count], %[count], $a3 \n"
+ // bytes left after initial a3 bytes
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+ "swr $v1, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+ // below the dst will be word aligned (NOTE1)
+ "$ua_chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sltu $v1, $t9, %[dst] \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sltu $v1,$t9,%[dst] \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // the t8 is the reminder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the reminder past 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r"(dst), [src] "+r"(src)
+ : [count] "r"(count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
+ "at");
+}
+#endif // HAS_COPYROW_MIPS
+
+// DSPR2 functions
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
+ (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiplies of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 |
+ // U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 |
+ // U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 |
+ // U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 |
+ // V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 |
+ // U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw $t6, 12(%[dst_v]) \n"
+ "sw $t7, 12(%[dst_u]) \n"
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
+ [dst_v] "+r"(dst_v)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t4, %[width], 4 \n" // multiplies of 16
+ "andi $t5, %[width], 0xf \n"
+ "blez $t4, 2f \n"
+ " addu %[src], %[src], %[width] \n" // src += width
+
+ "1: \n"
+ "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+ "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+ "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+ "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t1, $t1 \n" // |6|7|4|5|
+ "wsbh $t2, $t2 \n" // |10|11|8|9|
+ "wsbh $t3, $t3 \n" // |14|15|12|13|
+ "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+ "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+ "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+ "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+ "addiu %[src], %[src], -16 \n"
+ "addiu $t4, $t4, -1 \n"
+ "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+ "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+ "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+ "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+ "bgtz $t4, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+ "beqz $t5, 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -1(%[src]) \n"
+ "addiu $t5, $t5, -1 \n"
+ "addiu %[src], %[src], -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgez $t5, 2b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src] "+r"(src), [dst] "+r"(dst)
+ : [width] "r"(width)
+ : "t0", "t1", "t2", "t3", "t4", "t5");
+}
+
+void MirrorUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ int y;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "addu $t4, %[width], %[width] \n"
+ "srl %[x], %[width], 4 \n"
+ "andi %[y], %[width], 0xf \n"
+ "blez %[x], 2f \n"
+ " addu %[src_uv], %[src_uv], $t4 \n"
+
+ "1: \n"
+ "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+ "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+ "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+ "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+ "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+ "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+ "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+ "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+ "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+ "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+ "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+ "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+ "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+ "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+ "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+ "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+ "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+ "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+ "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+ "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+ "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+ "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+ "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+ "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+ "addiu %[src_uv], %[src_uv], -32 \n"
+ "addiu %[x], %[x], -1 \n"
+ "swr $t4, 0(%[dst_u]) \n"
+ "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+ "swr $t6, 0(%[dst_v]) \n"
+ "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+ "swr $t3, 4(%[dst_v]) \n"
+ "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+ "swr $t0, 8(%[dst_u]) \n"
+ "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+ "swr $t1, 8(%[dst_v]) \n"
+ "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+ "swr $t9, 12(%[dst_u]) \n"
+ "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+ "swr $t5, 12(%[dst_v]) \n"
+ "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz %[x], 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+ "beqz %[y], 3f \n"
+ " nop \n"
+ "b 2f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -2(%[src_uv]) \n"
+ "lbu $t1, -1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], -2 \n"
+ "addiu %[y], %[y], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[y], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
+ [x] "=&r"(x), [y] "=&r"(y)
+ : [width] "r"(width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
+}
+
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_yg;
+ uint32 tmp_mask = 0x7fff7fff;
+ tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_u]) \n"
+ "lbu %[tmp_t3], 0(%[src_v]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "sw %[tmp_t8], 0(%[rgb_buf]) \n"
+ "sw %[tmp_t7], 4(%[rgb_buf]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
+ [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
+ [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
+ [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 4 pixels.
+ }
+}
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_DSPR2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y0_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "replv.ph $t0, %[y0_fraction] \n"
+ "replv.ph $t1, %[source_y_fraction] \n"
+
+ "1: \n"
+ "lw $t2, 0(%[src_ptr]) \n"
+ "lw $t3, 0(%[src_ptr1]) \n"
+ "lw $t4, 4(%[src_ptr]) \n"
+ "lw $t5, 4(%[src_ptr1]) \n"
+ "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+ "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+ "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+ "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+ "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+ "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+ "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+ "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+ "addq.ph $t6, $t6, $t8 \n"
+ "addq.ph $t7, $t7, $t9 \n"
+ "addq.ph $t2, $t2, $t4 \n"
+ "addq.ph $t3, $t3, $t5 \n"
+ "shra_r.ph $t6, $t6, 8 \n"
+ "shra_r.ph $t7, $t7, 8 \n"
+ "shra_r.ph $t2, $t2, 8 \n"
+ "shra_r.ph $t3, $t3, 8 \n"
+ "precr.qb.ph $t6, $t6, $t7 \n"
+ "precr.qb.ph $t2, $t2, $t3 \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+ "addiu %[dst_width], %[dst_width], -8 \n"
+ "sw $t6, 0(%[dst_ptr]) \n"
+ "sw $t2, 4(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+ ".set pop \n"
+ : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
+ [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
+ : [source_y_fraction] "r"(source_y_fraction),
+ [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+#include
+void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ int x;
+ uint32 tmp_mask = 0xff;
+ uint32 tmp_t1;
+ for (x = 0; x < (width - 1); ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_rgb24], %[src_rgb24], 3 \n"
+ "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
+ "sw %[tmp_t1], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
+ [tmp_t1] "=&r"(tmp_t1)
+ : [tmp_mask] "r"(tmp_mask)
+ : "memory");
+ }
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+}
+
+void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
+ int x;
+ uint32 tmp_mask = 0xff;
+ uint32 tmp_t1, tmp_t2;
+ for (x = 0; x < (width - 1); ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "ulw %[tmp_t1], 0(%[src_raw]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_raw], %[src_raw], 3 \n"
+ "srl %[tmp_t2], %[tmp_t1], 16 \n"
+ "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
+ "ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
+ "ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
+ "sw %[tmp_t1], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
+ [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
+ : [tmp_mask] "r"(tmp_mask)
+ : "memory");
+ }
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+}
+
+void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ uint32 tmp_mask = 0xff;
+ uint32 tmp_t1, tmp_t2, tmp_t3;
+ for (x = 0; x < width; ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_rgb565], %[src_rgb565], 2 \n"
+ "sll %[tmp_t2], %[tmp_t1], 8 \n"
+ "ins %[tmp_t2], %[tmp_mask], 24,8 \n"
+ "ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
+ "ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
+ "srl %[tmp_t3], %[tmp_t1], 9 \n"
+ "ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
+ "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
+ "srl %[tmp_t3], %[tmp_t1], 2 \n"
+ "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
+ "sw %[tmp_t2], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
+ [dst_argb] "+r"(dst_argb)
+ : [tmp_mask] "r"(tmp_mask));
+ }
+}
+
+void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ uint32 tmp_t1, tmp_t2, tmp_t3;
+ for (x = 0; x < width; ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lh %[tmp_t1], 0(%[src_argb1555]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_argb1555], %[src_argb1555], 2 \n"
+ "sll %[tmp_t2], %[tmp_t1], 9 \n"
+ "ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
+ "ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
+ "srl %[tmp_t3], %[tmp_t1], 7 \n"
+ "ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
+ "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
+ "srl %[tmp_t3], %[tmp_t1], 2 \n"
+ "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
+ "sw %[tmp_t2], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
+ [dst_argb] "+r"(dst_argb)
+ :);
+ }
+}
+
+void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ uint32 tmp_t1;
+ for (x = 0; x < width; ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lh %[tmp_t1], 0(%[src_argb4444]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_argb4444], %[src_argb4444], 2 \n"
+ "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
+ "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
+ "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
+ "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
+ "sw %[tmp_t1], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
+ [tmp_t1] "=&r"(tmp_t1));
+ }
+}
+
+void I444ToARGBRow_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_mask = 0x7fff7fff;
+ uint32 tmp_yg;
+
+ tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[y_buf]) \n"
+ "lbu %[tmp_t1], 1(%[y_buf]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lh %[tmp_t2], 0(%[u_buf]) \n"
+ "lh %[tmp_t3], 0(%[v_buf]) \n"
+ "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "sw %[tmp_t8], 0(%[rgb_buf]) \n"
+ "sw %[tmp_t7], 4(%[rgb_buf]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
+ [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
+ [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
+ [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
+ [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
+ y_buf += 2;
+ u_buf += 2;
+ v_buf += 2;
+ rgb_buf += 8; // Advance 1 pixel.
+ }
+}
+
+void I422ToARGB4444Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_yg;
+ uint32 tmp_mask = 0x7fff7fff;
+ tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_u]) \n"
+ "lbu %[tmp_t3], 0(%[src_v]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
+ "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
+ "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
+ "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
+ "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
+ "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
+ "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
+ [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
+ [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
+ [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
+ [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb4444 += 4; // Advance 2 pixels.
+ }
+}
+
+void I422ToARGB1555Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_yg;
+ uint32 tmp_mask = 0x80008000;
+ tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_u]) \n"
+ "lbu %[tmp_t3], 0(%[src_v]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
+ "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
+ "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
+ "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
+ "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
+ "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
+ "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
+ "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
+ "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
+ [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
+ [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
+ [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
+ [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb1555 += 4; // Advance 2 pixels.
+ }
+}
+
+void NV12ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_mask = 0x7fff7fff;
+ uint32 tmp_yg;
+ tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_uv]) \n"
+ "lbu %[tmp_t3], 1(%[src_uv]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "sw %[tmp_t8], 0(%[rgb_buf]) \n"
+ "sw %[tmp_t7], 4(%[rgb_buf]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
+ [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
+ [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
+ [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
+ [tmp_mask] "r"(tmp_mask));
+
+ src_y += 2;
+ src_uv += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
+
+void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffda0000;
+ int const2 = 0x0070ffb6;
+ int const3 = 0x00700000;
+ int const4 = 0xffeeffa2;
+ int const5 = 0x100;
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_rgb0]) \n"
+ "lw %[tmp_t2], 4(%[src_rgb0]) \n"
+ "lw %[tmp_t3], 0(%[src_rgb1]) \n"
+ "lw %[tmp_t4], 4(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00420000;
+ int const2 = 0x00190081;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffb6ffda;
+ int const2 = 0x00000070;
+ int const3 = 0xffa20070;
+ int const4 = 0x0000ffee;
+ int const5 = 0x100;
+
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_rgb0]) \n"
+ "lw %[tmp_t2], 4(%[src_rgb0]) \n"
+ "lw %[tmp_t3], 0(%[src_rgb1]) \n"
+ "lw %[tmp_t4], 4(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00810019;
+ int const2 = 0x00000042;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00810042;
+ int const2 = 0x00000019;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffb60070;
+ int const2 = 0x0000ffda;
+ int const3 = 0xffa2ffee;
+ int const4 = 0x00000070;
+ int const5 = 0x100;
+
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
+ "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
+ "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
+ "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00420081;
+ int const2 = 0x00190000;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffb60070;
+ int const2 = 0x0000ffda;
+ int const3 = 0xffa2ffee;
+ int const4 = 0x00000070;
+ int const5 = 0x100;
+
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_rgb0]) \n"
+ "lw %[tmp_t2], 4(%[src_rgb0]) \n"
+ "lw %[tmp_t3], 0(%[src_rgb1]) \n"
+ "lw %[tmp_t4], 4(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+#endif // __mips_dsp_rev >= 2
+
+#endif // defined(__mips__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_gcc.cc b/libyuv/src/main/cpp/libyuv/source/row_gcc.cc
new file mode 100644
index 0000000..8735070
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_gcc.cc
@@ -0,0 +1,5755 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0};
+
+// JPeg full range.
+static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
+
+static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
+
+static vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
+
+// Constants for BGRA
+static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+ 0, 33, 65, 13, 0, 33, 65, 13};
+
+static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
+
+static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR
+static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+ 33, 65, 13, 0, 33, 65, 13, 0};
+
+static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
+
+static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+ 0, 13, 65, 33, 0, 13, 65, 33};
+
+static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
+
+static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
+
+static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
+
+// 7 bit fixed point 0.5.
+static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+
+static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
+ 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+
+// Shuffle table for converting RAW to ARGB.
+static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RGB24.
+static uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RAW.
+static uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x18,0) ",%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGB24_0), // %3
+ "m"(kShuffleMaskRAWToRGB24_1), // %4
+ "m"(kShuffleMaskRAWToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8* src,
+ uint8* dst,
+ const uint32 dither4,
+ int width) {
+ asm volatile(
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8* src,
+ uint8* dst,
+ const uint32 dither4,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3,%%xmm6 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :: "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUVJ128), // %5
+ "m"(kARGBToVJ), // %6
+ "m"(kARGBToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kAddUVJ128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
+ "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
+ "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21 \
+ "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
+ "pshufb %[kShuffleNV21], %%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+#define READYUY2 \
+ "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
+ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
+ "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
+ "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+#define READUYVY \
+ "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
+ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
+ "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
+ "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants) \
+ "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
+ "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
+ "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
+ "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
+ "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
+ "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
+ "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa %%xmm11,%%xmm0 \n" \
+ "pmaddubsw %%xmm8,%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa %%xmm12,%%xmm1 \n" \
+ "pmaddubsw %%xmm9,%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa %%xmm13,%%xmm2 \n" \
+ "pmaddubsw %%xmm10,%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw %%xmm14,%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+#define YUVTORGB_REGS \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+#define YUVTORGB_REGS
+#endif
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
+ "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
+
+// Store 8 RGBA values.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
+ "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+ "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_SSSE3
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STORERGBA
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 \
+ "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
+ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
+ "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
+ "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
+ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
+ "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
+ "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
+ "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
+ "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
+ "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
+ "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
+ "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
+ "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
+
+#define YUVTORGB_AVX2(yuvconstants) \
+ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
+ "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
+ "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
+ "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
+ "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_REGS_AVX2 \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else // Convert 16 pixels: 16 UV and 16 Y.
+
+#define YUVTORGB_SETUP_AVX2(yuvconstants)
+#define YUVTORGB_AVX2(yuvconstants) \
+ "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
+ "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
+ "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+#define YUVTORGB_REGS_AVX2
+#endif
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
+ "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
+ "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorUVRow_SSSE3(const uint8* src,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "movdqa %4,%%xmm1 \n"
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vmovdqu %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
+ "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+ "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+ "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+ "9: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Multiple of 1.
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc");
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ", %%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
+ "lea " MEMLEA(0x20, 0) ", %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8, 1) ", %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+ 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
+ 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+ "vmovdqa %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
+ "lea " MEMLEA(0x80, 0) ", %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ : "m"(kPermdARGBToY_AVX), // %3
+ "m"(kShuffleAlphaShort_AVX2) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
+ "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+
+// Blend 8 pixels at a time
+void ARGBBlendRow_SSSE3(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
+ "vbroadcastss %%xmm6,%%ymm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
+ "vbroadcastss %%xmm7,%%ymm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 32 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+ 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
+ 128u, 128u, 14u, 15u, 14u, 15u,
+ 14u, 15u, 128u, 128u};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile (
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "movzb " MEMACCESS2(0x13,0) ",%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x17,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+void ARGBShadeRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__AVX2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width) {
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
+ int count) {
+ asm volatile (
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+ // 4 pixel loop \n"
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ LABELALIGN
+ "10: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* src_dudv,
+ int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp;
+ asm volatile (
+ "movq " MEMACCESS(3) ",%%xmm2 \n"
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop \n"
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "=&r"(temp) // %5
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+rm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
+ "vbroadcastss %%xmm4,%%ymm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "rep movsb " MEMMOVESTRING(1,0) " \n"
+ "jmp 999f \n"
+
+ "99: \n"
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+cm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+ "mov " MEMACCESS(4) ",%k2 \n"
+ "cmp $0x3000102,%k2 \n"
+ "je 3012f \n"
+ "cmp $0x10203,%k2 \n"
+ "je 123f \n"
+ "cmp $0x30201,%k2 \n"
+ "je 321f \n"
+ "cmp $0x2010003,%k2 \n"
+ "je 2103f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS(1) " \n"
+ "movzb " MEMACCESS2(0x1,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x1,1) " \n"
+ "movzb " MEMACCESS2(0x2,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x2,1) " \n"
+ "movzb " MEMACCESS2(0x3,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x3,1) " \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "123: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 123b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "321: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x39,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x39,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x39,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x39,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 321b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "2103: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x93,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x93,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x93,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x93,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 2103b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "3012: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
+ "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 3012b \n"
+
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "=&d"(pixel_temp), // %2
+ "+r"(width) // %3
+ : "r"(shuffler) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(3) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS(3) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+#endif // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ asm volatile (
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "pshufd $0x0,%3,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "x"(scale * kScaleBias) // %3
+ : "memory", "cc",
+ "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "x"(scale * kScaleBias) // %3
+ : "memory", "cc",
+ "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+ MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "x"(scale) // %3
+ : "memory", "cc",
+ "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+ MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc",
+ "xmm2", "xmm3"
+ );
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Tranform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb,
+ const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
+ MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Tranform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Tranform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff) {
+ uintptr_t pixel_temp;
+ uintptr_t table_temp;
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(2) ",%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS(2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS(3) " \n"
+ "movzb " MEMACCESS2(0x1,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x1,3) " \n"
+ "movzb " MEMACCESS2(0x2,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x2,3) " \n"
+ "movzb " MEMACCESS2(0x3,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x3,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x4,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x4,3) " \n"
+ "movzb " MEMACCESS2(0x5,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x5,3) " \n"
+ "movzb " MEMACCESS2(0x6,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x6,3) " \n"
+ "movzb " MEMACCESS2(0x7,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x7,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x8,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x8,3) " \n"
+ "movzb " MEMACCESS2(0x9,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x9,3) " \n"
+ "movzb " MEMACCESS2(0xa,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xa,3) " \n"
+ "movzb " MEMACCESS2(0xb,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xb,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb " MEMACCESS2(0xc,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xc,3) " \n"
+ "movzb " MEMACCESS2(0xd,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xd,3) " \n"
+ "movzb " MEMACCESS2(0xe,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xe,3) " \n"
+ "movzb " MEMACCESS2(0xf,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xf,3) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "lea " MEMLEA(0x10,3) ",%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_msa.cc b/libyuv/src/main/cpp/libyuv/source/row_msa.cc
new file mode 100644
index 0000000..f79de1c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_msa.cc
@@ -0,0 +1,2977 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
+ { \
+ ub = __msa_fill_w(yuvconst->kUVToB[0]); \
+ vr = __msa_fill_w(yuvconst->kUVToR[1]); \
+ ug = __msa_fill_w(yuvconst->kUVToG[0]); \
+ vg = __msa_fill_w(yuvconst->kUVToG[1]); \
+ bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
+ bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
+ br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
+ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+ }
+
+// Load YUV 422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+ { \
+ uint64 y_m; \
+ uint32 u_m, v_m; \
+ v4i32 zero_m = {0}; \
+ y_m = LD(psrc_y); \
+ u_m = LW(psrc_u); \
+ v_m = LW(psrc_v); \
+ out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
+ out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
+ out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
+ }
+
+// Clip input vector elements between 0 to 255
+#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
+ { \
+ v4i32 max_m = __msa_ldi_w(0xFF); \
+ \
+ in0 = __msa_maxi_s_w(in0, 0); \
+ in1 = __msa_maxi_s_w(in1, 0); \
+ in2 = __msa_maxi_s_w(in2, 0); \
+ in3 = __msa_maxi_s_w(in3, 0); \
+ in4 = __msa_maxi_s_w(in4, 0); \
+ in5 = __msa_maxi_s_w(in5, 0); \
+ in0 = __msa_min_s_w(max_m, in0); \
+ in1 = __msa_min_s_w(max_m, in1); \
+ in2 = __msa_min_s_w(max_m, in2); \
+ in3 = __msa_min_s_w(max_m, in3); \
+ in4 = __msa_min_s_w(max_m, in4); \
+ in5 = __msa_min_s_w(max_m, in5); \
+ }
+
+// Convert 8 pixels of YUV 420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+ { \
+ v8i16 vec0_m, vec1_m; \
+ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+ v4i32 reg5_m, reg6_m, reg7_m; \
+ v16i8 zero_m = {0}; \
+ \
+ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
+ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
+ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
+ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
+ reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
+ reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
+ reg0_m *= yg; \
+ reg1_m *= yg; \
+ reg2_m *= ubvr; \
+ reg3_m *= ubvr; \
+ reg0_m = __msa_srai_w(reg0_m, 16); \
+ reg1_m = __msa_srai_w(reg1_m, 16); \
+ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
+ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
+ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
+ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
+ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+ reg5_m = reg0_m - reg5_m; \
+ reg6_m = reg1_m - reg6_m; \
+ reg2_m = reg0_m - reg2_m; \
+ reg3_m = reg1_m - reg3_m; \
+ reg7_m = reg0_m - reg7_m; \
+ reg4_m = reg1_m - reg4_m; \
+ reg5_m += bb; \
+ reg6_m += bb; \
+ reg7_m += bg; \
+ reg4_m += bg; \
+ reg2_m += br; \
+ reg3_m += br; \
+ reg5_m = __msa_srai_w(reg5_m, 6); \
+ reg6_m = __msa_srai_w(reg6_m, 6); \
+ reg7_m = __msa_srai_w(reg7_m, 6); \
+ reg4_m = __msa_srai_w(reg4_m, 6); \
+ reg2_m = __msa_srai_w(reg2_m, 6); \
+ reg3_m = __msa_srai_w(reg3_m, 6); \
+ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
+ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
+ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
+
+// Pack and Store 8 ARGB values.
+#define STOREARGB(in0, in1, in2, in3, pdst_argb) \
+ { \
+ v8i16 vec0_m, vec1_m; \
+ v16u8 dst0_m, dst1_m; \
+ vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
+ dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
+ ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
+ }
+
+// Takes ARGB input and calculates Y.
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
+ y_out) \
+ { \
+ v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8u16 reg0_m, reg1_m; \
+ \
+ vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \
+ vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \
+ vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \
+ vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \
+ reg0_m = __msa_dotp_u_h(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_h(vec1_m, const0); \
+ reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \
+ reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \
+ reg0_m += const2; \
+ reg1_m += const2; \
+ reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \
+ reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \
+ y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ }
+
+// Loads current and next row of ARGB input and averages it to calculate U and V
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+ { \
+ v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
+ v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v16u8 vec8_m, vec9_m; \
+ v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
+ v8u16 reg8_m, reg9_m; \
+ \
+ src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \
+ src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \
+ src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \
+ src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \
+ src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \
+ src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \
+ src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \
+ src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \
+ vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
+ vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
+ reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \
+ reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \
+ reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \
+ reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \
+ reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \
+ reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \
+ reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \
+ reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \
+ reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
+ reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
+ reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
+ reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
+ argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
+ argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \
+ src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \
+ src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \
+ src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112); \
+ src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \
+ src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \
+ src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \
+ src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \
+ vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
+ vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
+ reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
+ reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
+ reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
+ reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
+ reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
+ reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
+ reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
+ reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
+ reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
+ reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
+ reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
+ reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
+ argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
+ argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_h(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_h(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_h(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_h(vec5_m, const1); \
+ reg0_m += const3; \
+ reg1_m += const3; \
+ reg2_m += const3; \
+ reg3_m += const3; \
+ reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ }
+
+// Load I444 pixel data
+#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+ { \
+ uint64 y_m, u_m, v_m; \
+ v2i64 zero_m = {0}; \
+ y_m = LD(psrc_y); \
+ u_m = LD(psrc_u); \
+ v_m = LD(psrc_v); \
+ out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \
+ out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \
+ out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \
+ }
+
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+ src += width - 64;
+
+ for (x = 0; x < width; x += 64) {
+ LD_UB4(src, 16, src3, src2, src1, src0);
+ VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+ VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+ src += width * 4 - 64;
+
+ for (x = 0; x < width; x += 16) {
+ LD_UB4(src, 16, src3, src2, src1, src0);
+ VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+ VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void I422ToYUY2Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width) {
+ int x;
+ v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+ v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+
+ for (x = 0; x < width; x += 32) {
+ src_u0 = LD_UB(src_u);
+ src_v0 = LD_UB(src_v);
+ LD_UB2(src_y, 16, src_y0, src_y1);
+ ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+ ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+ ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+ ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_yuy2 += 64;
+ }
+}
+
+void I422ToUYVYRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width) {
+ int x;
+ v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+ v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+
+ for (x = 0; x < width; x += 32) {
+ src_u0 = LD_UB(src_u);
+ src_v0 = LD_UB(src_v);
+ LD_UB2(src_y, 16, src_y0, src_y1);
+ ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+ ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+ ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+ ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_uyvy += 64;
+ }
+}
+
+void I422ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ rgb_buf += 32;
+ }
+}
+
+void I422ToRGBARow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ rgb_buf += 32;
+ }
+}
+
+void I422AlphaToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int64 data_a;
+ v16u8 src0, src1, src2, src3;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v4i32 zero = {0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ data_a = LD(src_a);
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
+ STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ src_a += 8;
+ rgb_buf += 32;
+ }
+}
+
+void I422ToRGB24Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int32 width) {
+ int x;
+ int64 data_u, data_v;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 reg0, reg1, reg2, reg3;
+ v2i64 zero = {0};
+ v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
+ v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
+ v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
+ 11, 29, 12, 13, 30, 14, 15, 31};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
+ data_u = LD(src_u);
+ data_v = LD(src_v);
+ src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
+ src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
+ src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec3, vec4, vec5);
+ reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
+ reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
+ reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ ST_UB(dst2, (rgb_buf + 32));
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ rgb_buf += 48;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec2, vec1);
+ vec0 = __msa_srai_h(vec0, 3);
+ vec1 = __msa_srai_h(vec1, 3);
+ vec2 = __msa_srai_h(vec2, 2);
+ vec1 = __msa_slli_h(vec1, 11);
+ vec2 = __msa_slli_h(vec2, 5);
+ vec0 |= vec1;
+ dst0 = (v16u8)(vec2 | vec0);
+ ST_UB(dst0, dst_rgb565);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ dst_rgb565 += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0;
+ v8i16 vec0, vec1, vec2;
+ v8u16 reg0, reg1, reg2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srai_h(vec0, 4);
+ reg1 = (v8u16)__msa_srai_h(vec1, 4);
+ reg2 = (v8u16)__msa_srai_h(vec2, 4);
+ reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
+ reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+ reg1 |= const_0xF000;
+ reg0 |= reg2;
+ dst0 = (v16u8)(reg1 | reg0);
+ ST_UB(dst0, dst_argb4444);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ dst_argb4444 += 16;
+ }
+}
+
+void I422ToARGB1555Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0;
+ v8i16 vec0, vec1, vec2;
+ v8u16 reg0, reg1, reg2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srai_h(vec0, 3);
+ reg1 = (v8u16)__msa_srai_h(vec1, 3);
+ reg2 = (v8u16)__msa_srai_h(vec2, 3);
+ reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
+ reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
+ reg1 |= const_0x8000;
+ reg0 |= reg2;
+ dst0 = (v16u8)(reg1 | reg0);
+ ST_UB(dst0, dst_argb1555);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ dst_argb1555 += 16;
+ }
+}
+
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_yuy2 += 64;
+ dst_y += 32;
+ }
+}
+
+void YUY2ToUVRow_MSA(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+ LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
+ src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+ src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+ vec0 = __msa_aver_u_b(src0, src2);
+ vec1 = __msa_aver_u_b(src1, src3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_yuy2 += 64;
+ src_yuy2_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+ src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_yuy2 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_uyvy += 64;
+ dst_y += 32;
+ }
+}
+
+void UYVYToUVRow_MSA(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+ LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
+ src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+ src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+ vec0 = __msa_aver_u_b(src0, src2);
+ vec1 = __msa_aver_u_b(src1, src3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_uyvy += 64;
+ src_uyvy_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToUV422Row_MSA(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+ src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_uyvy += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v16i8 zero = {0};
+ v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+ v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+ v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
+ reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
+ reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
+ reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
+ reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
+ reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
+ reg0 *= const_0x19;
+ reg1 *= const_0x19;
+ reg2 *= const_0x81;
+ reg3 *= const_0x81;
+ reg4 *= const_0x42;
+ reg5 *= const_0x42;
+ reg0 += reg2;
+ reg1 += reg3;
+ reg0 += reg4;
+ reg1 += reg5;
+ reg0 += const_0x1080;
+ reg1 += const_0x1080;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void ARGBToUVRow_MSA(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* src_argb0_next = src_argb0 + src_stride_argb;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+ v16u8 dst0, dst1;
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+ vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+ vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+ vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+ vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+ reg0 = __msa_hadd_u_h(vec8, vec8);
+ reg1 = __msa_hadd_u_h(vec9, vec9);
+ reg2 = __msa_hadd_u_h(vec4, vec4);
+ reg3 = __msa_hadd_u_h(vec5, vec5);
+ reg4 = __msa_hadd_u_h(vec0, vec0);
+ reg5 = __msa_hadd_u_h(vec1, vec1);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+ vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+ vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+ vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+ vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+ reg0 += __msa_hadd_u_h(vec8, vec8);
+ reg1 += __msa_hadd_u_h(vec9, vec9);
+ reg2 += __msa_hadd_u_h(vec4, vec4);
+ reg3 += __msa_hadd_u_h(vec5, vec5);
+ reg4 += __msa_hadd_u_h(vec0, vec0);
+ reg5 += __msa_hadd_u_h(vec1, vec1);
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg6 = reg0 * const_0x70;
+ reg7 = reg1 * const_0x70;
+ reg8 = reg2 * const_0x4A;
+ reg9 = reg3 * const_0x4A;
+ reg6 += const_0x8080;
+ reg7 += const_0x8080;
+ reg8 += reg4 * const_0x26;
+ reg9 += reg5 * const_0x26;
+ reg0 *= const_0x12;
+ reg1 *= const_0x12;
+ reg2 *= const_0x5E;
+ reg3 *= const_0x5E;
+ reg4 *= const_0x70;
+ reg5 *= const_0x70;
+ reg2 += reg0;
+ reg3 += reg1;
+ reg4 += const_0x8080;
+ reg5 += const_0x8080;
+ reg6 -= reg8;
+ reg7 -= reg9;
+ reg4 -= reg2;
+ reg5 -= reg3;
+ reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
+ reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_argb0 += 128;
+ src_argb0_next += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+ v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
+ v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
+ 16, 17, 18, 20, 21, 22, 24, 25};
+ v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
+ 21, 22, 24, 25, 26, 28, 29, 30};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_rgb, 16);
+ ST_UB(dst2, (dst_rgb + 32));
+ src_argb += 64;
+ dst_rgb += 48;
+ }
+}
+
+void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+ v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
+ v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
+ 18, 17, 16, 22, 21, 20, 26, 25};
+ v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
+ 21, 20, 26, 25, 24, 30, 29, 28};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_rgb, 16);
+ ST_UB(dst2, (dst_rgb + 32));
+ src_argb += 64;
+ dst_rgb += 48;
+ }
+}
+
+void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+ vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
+ vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
+ vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+ vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
+ vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
+ vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+ vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+ vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
+ vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+ vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
+ vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
+ vec0 = __msa_binsli_b(vec0, vec1, 2);
+ vec1 = __msa_binsli_b(vec2, vec3, 4);
+ vec4 = __msa_binsli_b(vec4, vec5, 2);
+ vec5 = __msa_binsli_b(vec6, vec7, 4);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
+ dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+ vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
+ vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
+ vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+ vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+ vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
+ vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+ vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
+ vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
+ vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+ vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
+ vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
+ vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
+ vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
+ vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
+ vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
+ vec0 = __msa_binsli_b(vec0, vec1, 2);
+ vec5 = __msa_binsli_b(vec5, vec6, 2);
+ vec1 = __msa_binsli_b(vec2, vec3, 5);
+ vec6 = __msa_binsli_b(vec7, vec8, 5);
+ vec1 = __msa_binsli_b(vec1, vec4, 0);
+ vec6 = __msa_binsli_b(vec6, vec9, 0);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
+ dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1;
+ v16u8 vec0, vec1;
+ v16u8 dst0;
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
+ vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
+ src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
+ src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
+ vec0 = __msa_binsli_b(vec0, src0, 3);
+ vec1 = __msa_binsli_b(vec1, src1, 3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBToUV444Row_MSA(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int32 width) {
+ int32 x;
+ v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 vec8, vec9, vec10, vec11;
+ v8u16 const_112 = (v8u16)__msa_ldi_h(112);
+ v8u16 const_74 = (v8u16)__msa_ldi_h(74);
+ v8u16 const_38 = (v8u16)__msa_ldi_h(38);
+ v8u16 const_94 = (v8u16)__msa_ldi_h(94);
+ v8u16 const_18 = (v8u16)__msa_ldi_h(18);
+ v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
+ v16i8 zero = {0};
+
+ for (x = width; x > 0; x -= 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+ src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
+ vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+ vec10 = vec0 * const_18;
+ vec11 = vec1 * const_18;
+ vec8 = vec2 * const_94;
+ vec9 = vec3 * const_94;
+ vec6 = vec4 * const_112;
+ vec7 = vec5 * const_112;
+ vec0 *= const_112;
+ vec1 *= const_112;
+ vec2 *= const_74;
+ vec3 *= const_74;
+ vec4 *= const_38;
+ vec5 *= const_38;
+ vec8 += vec10;
+ vec9 += vec11;
+ vec6 += const_32896;
+ vec7 += const_32896;
+ vec0 += const_32896;
+ vec1 += const_32896;
+ vec2 += vec4;
+ vec3 += vec5;
+ vec0 -= vec2;
+ vec1 -= vec3;
+ vec6 -= vec8;
+ vec7 -= vec9;
+ vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+ vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
+ vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_argb += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBMultiplyRow_MSA(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v4u32 reg0, reg1, reg2, reg3;
+ v8i16 zero = {0};
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+ reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+ reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+ reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+ reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+ reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+ reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+ reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+ reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+ reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+ reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+ reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+ reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_argb);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
+void ARGBAddRow_MSA(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+ dst0 = __msa_adds_u_b(src0, src2);
+ dst1 = __msa_adds_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBSubtractRow_MSA(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+ dst0 = __msa_subs_u_b(src0, src2);
+ dst1 = __msa_subs_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 zero = {0};
+ v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
+ vec4 = (v8u16)__msa_fill_h(vec0[3]);
+ vec5 = (v8u16)__msa_fill_h(vec0[7]);
+ vec6 = (v8u16)__msa_fill_h(vec1[3]);
+ vec7 = (v8u16)__msa_fill_h(vec1[7]);
+ vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+ vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ vec6 = (v8u16)__msa_fill_h(vec2[3]);
+ vec7 = (v8u16)__msa_fill_h(vec2[7]);
+ vec8 = (v8u16)__msa_fill_h(vec3[3]);
+ vec9 = (v8u16)__msa_fill_h(vec3[7]);
+ vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+ reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
+ reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
+ reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
+ reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
+ reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
+ reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
+ reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
+ reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
+ reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+ reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+ reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+ reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+ reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+ reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+ reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+ reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+ reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+ reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+ reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+ reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+ reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
+ reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
+ reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
+ reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+ vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst0 = __msa_bmnz_v(dst0, src0, mask);
+ dst1 = __msa_bmnz_v(dst1, src1, mask);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ uint32 dither4,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0, vec0, vec1;
+ v8i16 vec_d0;
+ v8i16 reg0, reg1, reg2;
+ v16i8 zero = {0};
+ v8i16 max = __msa_ldi_h(0xFF);
+
+ vec_d0 = (v8i16)__msa_fill_w(dither4);
+ vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
+ reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
+ reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
+ reg0 += vec_d0;
+ reg1 += vec_d0;
+ reg2 += vec_d0;
+ reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
+ reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
+ reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
+ reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
+ reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
+ reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
+ reg0 = __msa_srai_h(reg0, 3);
+ reg2 = __msa_srai_h(reg2, 3);
+ reg1 = __msa_srai_h(reg1, 2);
+ reg2 = __msa_slli_h(reg2, 11);
+ reg1 = __msa_slli_h(reg1, 5);
+ reg0 |= reg1;
+ dst0 = (v16u8)(reg0 | reg2);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBShuffleRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1;
+ v16i8 vec0;
+ v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+ int32 val = LW((int32*)shuffler);
+
+ vec0 = (v16i8)__msa_fill_w(val);
+ shuffler_vec += vec0;
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBShadeRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ int x;
+ v16u8 src0, dst0;
+ v8u16 vec0, vec1;
+ v4u32 reg0, reg1, reg2, reg3, rgba_scale;
+ v8i16 zero = {0};
+
+ rgba_scale[0] = value;
+ rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
+ rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+ reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+ reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+ reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+ reg0 *= rgba_scale;
+ reg1 *= rgba_scale;
+ reg2 *= rgba_scale;
+ reg3 *= rgba_scale;
+ reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+ reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+ reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+ reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_argb);
+ src_argb += 16;
+ dst_argb += 16;
+ }
+}
+
+void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, dst0, dst1;
+ v8u16 reg0;
+ v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
+ v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+ vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+ reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
+ vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
+ v8u16 reg0, reg1, reg2;
+ v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
+ v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
+ v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
+ v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
+ v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
+ v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
+ v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
+ vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+ vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+ vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
+ reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
+ reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
+ reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
+ reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
+ reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
+ reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
+ reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
+ reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
+ vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+ vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
+void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1;
+ v8u16 vec0, vec1, vec2, vec3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
+ vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
+ vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
+ vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
+ vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
+ vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
+ vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
+ vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
+ vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_argb4444 += 32;
+ dst_argb += 64;
+ }
+}
+
+void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v8u16 src0, src1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
+ src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+ vec2 = src0 & const_0x1F;
+ vec3 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+ reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
+ reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
+ reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
+ reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
+ reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
+ reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
+ reg3 = -reg3;
+ reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
+ reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
+ reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
+ reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_argb1555 += 32;
+ dst_argb += 64;
+ }
+}
+
+void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ int x;
+ v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+ v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
+ src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src0 & const_0x7E0;
+ vec2 = src0 & const_0xF800;
+ vec3 = src1 & const_0x1F;
+ vec4 = src1 & const_0x7E0;
+ vec5 = src1 & const_0xF800;
+ reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+ reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+ reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+ reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+ reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+ reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+ reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+ reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+ reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+ reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+ reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+ reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+ res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
+ res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
+ res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
+ res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_rgb565 += 32;
+ dst_argb += 64;
+ }
+}
+
+void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v16u8 vec0, vec1, vec2;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
+ vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+ vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+ vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+ dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
+ dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_rgb24 += 48;
+ dst_argb += 64;
+ }
+}
+
+void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v16u8 vec0, vec1, vec2;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+ vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+ vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+ vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+ dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
+ dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_raw += 48;
+ dst_argb += 64;
+ }
+}
+
+void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
+ int x;
+ v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v16u8 dst0;
+ v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+ v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+ v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ vec2 = src0 & const_0x1F;
+ vec3 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+ reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
+ reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
+ reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
+ reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
+ reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+ reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
+ reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
+ reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
+ reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
+ reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
+ reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
+ reg0 *= const_0x19;
+ reg1 *= const_0x19;
+ reg2 *= const_0x81;
+ reg3 *= const_0x81;
+ reg4 *= const_0x42;
+ reg5 *= const_0x42;
+ reg0 += reg2;
+ reg1 += reg3;
+ reg0 += reg4;
+ reg1 += reg5;
+ reg0 += const_0x1080;
+ reg1 += const_0x1080;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ ST_UB(dst0, dst_y);
+ src_argb1555 += 32;
+ dst_y += 16;
+ }
+}
+
+void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
+ int x;
+ v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v4u32 res0, res1, res2, res3;
+ v16u8 dst0;
+ v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
+ v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
+ v8i16 const_0x1080 = __msa_fill_h(0x1080);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+ v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src0 & const_0x7E0;
+ vec2 = src0 & const_0xF800;
+ vec3 = src1 & const_0x1F;
+ vec4 = src1 & const_0x7E0;
+ vec5 = src1 & const_0xF800;
+ reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+ reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+ reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+ reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+ reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+ reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+ reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+ reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+ reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+ reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+ reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+ reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+ vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
+ vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
+ vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
+ vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
+ vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
+ vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
+ vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
+ res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
+ res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
+ res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
+ res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
+ res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
+ res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
+ res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
+ res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
+ res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
+ res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
+ res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
+ res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_y);
+ src_rgb565 += 32;
+ dst_y += 16;
+ }
+}
+
+void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
+ v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+ v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+ 18, 19, 20, 21, 21, 22, 23, 24};
+ v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+ v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+ reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+ reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+ vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
+ vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
+ vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
+ vec0 += const_0x1080;
+ vec1 += const_0x1080;
+ vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 48;
+ dst_y += 16;
+ }
+}
+
+void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
+ v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+ v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+ 18, 19, 20, 21, 21, 22, 23, 24};
+ v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+ v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+ reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+ reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+ vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
+ vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
+ vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
+ vec0 += const_0x1080;
+ vec1 += const_0x1080;
+ vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 48;
+ dst_y += 16;
+ }
+}
+
+void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint16* s = (const uint16*)src_argb1555;
+ const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
+ int64_t res0, res1;
+ v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+ src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+ src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ vec0 += src2 & const_0x1F;
+ vec1 += src3 & const_0x1F;
+ vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+ vec2 = src0 & const_0x1F;
+ vec3 = src1 & const_0x1F;
+ vec2 += src2 & const_0x1F;
+ vec3 += src3 & const_0x1F;
+ vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ vec4 += src2 & const_0x1F;
+ vec5 += src3 & const_0x1F;
+ vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+ vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+ vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+ vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+ vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
+ vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
+ reg0 = vec6 * const_0x70;
+ reg1 = vec0 * const_0x4A;
+ reg2 = vec2 * const_0x70;
+ reg3 = vec0 * const_0x5E;
+ reg0 += const_0x8080;
+ reg1 += vec2 * const_0x26;
+ reg2 += const_0x8080;
+ reg3 += vec6 * const_0x12;
+ reg0 -= reg1;
+ reg2 -= reg3;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ s += 16;
+ t += 16;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint16* s = (const uint16*)src_rgb565;
+ const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
+ int64_t res0, res1;
+ v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+ src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+ src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ vec0 += src2 & const_0x1F;
+ vec1 += src3 & const_0x1F;
+ vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+ vec2 = src0 & const_0x3F;
+ vec3 = src1 & const_0x3F;
+ vec2 += src2 & const_0x3F;
+ vec3 += src3 & const_0x3F;
+ vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ vec4 += src2 & const_0x1F;
+ vec5 += src3 & const_0x1F;
+ vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+ vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+ vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+ vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+ reg0 = vec3 * const_0x70;
+ reg1 = vec1 * const_0x4A;
+ reg2 = vec4 * const_0x70;
+ reg3 = vec1 * const_0x5E;
+ reg0 += const_32896;
+ reg1 += vec4 * const_0x26;
+ reg2 += const_32896;
+ reg3 += vec3 * const_0x12;
+ reg0 -= reg1;
+ reg2 -= reg3;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ s += 16;
+ t += 16;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void RGB24ToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ int64 res0, res1;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 reg0, reg1, reg2, reg3;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+ src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+ src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+ src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+ src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+ src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+ src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+ src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+ src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+ src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+ src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+ src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+ src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+ src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+ vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+ vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+ vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+ vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+ vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+ reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+ reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+ reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+ reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+ reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+ reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+ reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+ reg0 = __msa_srai_h((v8i16)reg0, 2);
+ reg1 = __msa_srai_h((v8i16)reg1, 2);
+ reg2 = __msa_srai_h((v8i16)reg2, 2);
+ reg3 = __msa_srai_h((v8i16)reg3, 2);
+ vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
+ vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
+ vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
+ vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+ vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+ vec3 = vec0 * const_0x70;
+ vec4 = vec1 * const_0x4A;
+ vec5 = vec2 * const_0x26;
+ vec2 *= const_0x70;
+ vec1 *= const_0x5E;
+ vec0 *= const_0x12;
+ reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+ reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+ reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+ reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+ reg0 += reg1;
+ reg2 += reg3;
+ reg0 = __msa_srai_h(reg0, 8);
+ reg2 = __msa_srai_h(reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ t += 48;
+ s += 48;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void RAWToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ int64 res0, res1;
+ v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 reg0, reg1, reg2, reg3;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+ src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+ src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+ src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+ src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+ src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+ src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+ src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+ src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+ src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+ src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+ src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+ src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+ src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+ vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+ vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+ vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+ vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+ vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+ reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+ reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+ reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+ reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+ reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+ reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+ reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+ reg0 = __msa_srai_h(reg0, 2);
+ reg1 = __msa_srai_h(reg1, 2);
+ reg2 = __msa_srai_h(reg2, 2);
+ reg3 = __msa_srai_h(reg3, 2);
+ vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+ vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+ vec3 = vec0 * const_0x70;
+ vec4 = vec1 * const_0x4A;
+ vec5 = vec2 * const_0x26;
+ vec2 *= const_0x70;
+ vec1 *= const_0x5E;
+ vec0 *= const_0x12;
+ reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+ reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+ reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+ reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+ reg0 += reg1;
+ reg2 += reg3;
+ reg0 = __msa_srai_h(reg0, 8);
+ reg2 = __msa_srai_h(reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ t += 48;
+ s += 48;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void NV12ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, res0, res1, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 zero = {0};
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ val0 = LD(src_y);
+ val1 = LD(src_uv);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+ res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_uv += 8;
+ rgb_buf += 32;
+ }
+}
+
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, dst0;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 zero = {0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ val0 = LD(src_y);
+ val1 = LD(src_uv);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ vec0 = vec0 >> 3;
+ vec1 = (vec1 >> 2) << 5;
+ vec2 = (vec2 >> 3) << 11;
+ dst0 = (v16u8)(vec0 | vec1 | vec2);
+ ST_UB(dst0, rgb_buf);
+ src_y += 8;
+ src_uv += 8;
+ rgb_buf += 16;
+ }
+}
+
+void NV21ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, res0, res1, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v16u8 zero = {0};
+ v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ val0 = LD(src_y);
+ val1 = LD(src_vu);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+ res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_vu += 8;
+ rgb_buf += 32;
+ }
+}
+
+void SobelRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+ v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+ v16i8 const_0x4 = __msa_ldi_b(0x4);
+ v16i8 mask1 = mask0 + const_0x4;
+ v16i8 mask2 = mask1 + const_0x4;
+ v16i8 mask3 = mask2 + const_0x4;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ vec0 = __msa_adds_u_b(src0, src1);
+ dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
+ dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
+ dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+ dst0 = __msa_adds_u_b(src0, src2);
+ dst1 = __msa_adds_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_sobelx += 32;
+ src_sobely += 32;
+ dst_y += 32;
+ }
+}
+
+void SobelXYRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, vec2;
+ v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ vec0 = __msa_adds_u_b(src0, src1);
+ vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+ vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+ reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
+ reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
+void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
+ v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
+ v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
+ v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
+ v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void ARGBToUVJRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, vec2, vec3;
+ v16u8 dst0, dst1;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+ v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
+ v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
+ v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ src0 = __msa_aver_u_b(src0, src4);
+ src1 = __msa_aver_u_b(src1, src5);
+ src2 = __msa_aver_u_b(src2, src6);
+ src3 = __msa_aver_u_b(src3, src7);
+ src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+ src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+ src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+ vec0 = __msa_aver_u_b(src4, src6);
+ vec1 = __msa_aver_u_b(src5, src7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
+ src0 = __msa_aver_u_b(src0, src4);
+ src1 = __msa_aver_u_b(src1, src5);
+ src2 = __msa_aver_u_b(src2, src6);
+ src3 = __msa_aver_u_b(src3, src7);
+ src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+ src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+ src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+ vec2 = __msa_aver_u_b(src4, src6);
+ vec3 = __msa_aver_u_b(src5, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
+ const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_v);
+ ST_UB(dst1, dst_u);
+ s += 128;
+ t += 128;
+ dst_v += 16;
+ dst_u += 16;
+ }
+}
+
+void BGRAToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+ v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+ v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+ v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+ const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_v);
+ ST_UB(dst1, dst_u);
+ s += 128;
+ t += 128;
+ dst_v += 16;
+ dst_u += 16;
+ }
+}
+
+void ABGRToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+ v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
+ v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
+ v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ READ_ARGB(s, t, src0, src1, src2, src3);
+ ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
+ const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ s += 128;
+ t += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RGBAToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+ v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
+ v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+ v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+ const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ s += 128;
+ t += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void I444ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0, dst1;
+ v8u16 vec0, vec1, vec2;
+ v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 zero = {0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+
+ for (x = 0; x < width; x += 8) {
+ READI444(src_y, src_u, src_v, src0, src1, src2);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ reg4 = reg0 + vec_br;
+ reg5 = reg1 + vec_br;
+ reg2 = reg0 + vec_bg;
+ reg3 = reg1 + vec_bg;
+ reg0 += vec_bb;
+ reg1 += vec_bb;
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
+ reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+ reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+ reg0 -= reg6 * vec_ub;
+ reg1 -= reg7 * vec_ub;
+ reg2 -= reg6 * vec_ug;
+ reg3 -= reg7 * vec_ug;
+ reg4 -= reg8 * vec_vr;
+ reg5 -= reg9 * vec_vr;
+ reg2 -= reg8 * vec_vg;
+ reg3 -= reg9 * vec_vg;
+ reg0 = __msa_srai_w(reg0, 6);
+ reg1 = __msa_srai_w(reg1, 6);
+ reg2 = __msa_srai_w(reg2, 6);
+ reg3 = __msa_srai_w(reg3, 6);
+ reg4 = __msa_srai_w(reg4, 6);
+ reg5 = __msa_srai_w(reg5, 6);
+ CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+ vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
+ dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
+ dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_u += 8;
+ src_v += 8;
+ rgb_buf += 32;
+ }
+}
+
+void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
+ int x;
+ v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
+ v8i16 vec0, vec1;
+ v4i32 reg0, reg1, reg2, reg3;
+ v4i32 vec_yg = __msa_fill_w(0x4A35);
+ v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 max = __msa_ldi_h(0xFF);
+ v8i16 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
+ reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
+ reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
+ reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg2 *= vec_yg;
+ reg3 *= vec_yg;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ reg2 = __msa_srai_w(reg2, 16);
+ reg3 = __msa_srai_w(reg3, 16);
+ vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec0 += vec_ygb;
+ vec1 += vec_ygb;
+ vec0 = __msa_srai_h(vec0, 6);
+ vec1 = __msa_srai_h(vec1, 6);
+ vec0 = __msa_maxi_s_h(vec0, 0);
+ vec1 = __msa_maxi_s_h(vec1, 0);
+ vec0 = __msa_min_s_h(max, vec0);
+ vec1 = __msa_min_s_h(max, vec1);
+ res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
+ res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
+ res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
+ res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
+ ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
+ src_y += 16;
+ rgb_buf += 64;
+ }
+}
+
+void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+ vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
+ vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_y += 16;
+ dst_argb += 64;
+ }
+}
+
+void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+ src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ src_yuy2 += 16;
+ rgb_buf += 32;
+ }
+}
+
+void UYVYToARGBRow_MSA(const uint8* src_uyvy,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
+ src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+ src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ src_uyvy += 16;
+ rgb_buf += 32;
+ }
+}
+
+void InterpolateRow_MSA(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int32 source_y_fraction) {
+ int32 y1_fraction = source_y_fraction;
+ int32 y0_fraction = 256 - y1_fraction;
+ uint16 y_fractions;
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, y_frac;
+
+ if (0 == y1_fraction) {
+ memcpy(dst_ptr, src_ptr, width);
+ return;
+ }
+
+ if (128 == y1_fraction) {
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ dst0 = __msa_aver_u_b(src0, src2);
+ dst1 = __msa_aver_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_ptr, 16);
+ s += 32;
+ t += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
+ y_frac = (v8u16)__msa_fill_h(y_fractions);
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
+ vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
+ vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
+ vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
+ vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
+ vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
+ vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ ST_UB2(dst0, dst1, dst_ptr, 16);
+ s += 32;
+ t += 32;
+ dst_ptr += 32;
+ }
+}
+
+void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
+ int x;
+ v16u8 dst0 = (v16u8)__msa_fill_w(v32);
+
+ for (x = 0; x < width; x += 4) {
+ ST_UB(dst0, dst_argb);
+ dst_argb += 16;
+ }
+}
+
+void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+ v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
+ v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13,
+ 18, 17, 16, 21, 20, 19, 24, 23};
+ v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
+ 24, 23, 28, 27, 26, 31, 30, 29};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+ src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
+ src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
+ ST_UB2(dst0, dst1, dst_rgb24, 16);
+ ST_UB(dst2, (dst_rgb24 + 32));
+ src_raw += 48;
+ dst_rgb24 += 48;
+ }
+}
+
+void MergeUVRow_MSA(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1;
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
+ ST_UB2(dst0, dst1, dst_uv, 16);
+ src_u += 16;
+ src_v += 16;
+ dst_uv += 32;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/libyuv/src/main/cpp/libyuv/source/row_neon.cc b/libyuv/src/main/cpp/libyuv/source/row_neon.cc
new file mode 100644
index 0000000..fcbade3
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_neon.cc
@@ -0,0 +1,2558 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+#define YUVTORGB_SETUP \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+
+#define YUVTORGB \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
+ "vmovl.u8 q0, d0 \n" /* Y */ \
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
+ "vadd.s16 d18, d19 \n" \
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUV444 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // d19 modified by
+ // YUVTORGB
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+#define ARGBTORGB565 \
+ "vshll.u8 q0, d22, #8 \n" /* R */ \
+ "vshll.u8 q8, d21, #8 \n" /* G */ \
+ "vshll.u8 q9, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #5 \n" /* RG */ \
+ "vsri.16 q0, q9, #11 \n" /* RGB */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+#define ARGBTOARGB1555 \
+ "vshll.u8 q0, d23, #8 \n" /* A */ \
+ "vshll.u8 q8, d22, #8 \n" /* R */ \
+ "vshll.u8 q9, d21, #8 \n" /* G */ \
+ "vshll.u8 q10, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #1 \n" /* AR */ \
+ "vsri.16 q0, q9, #6 \n" /* ARG */ \
+ "vsri.16 q0, q10, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ // ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ // ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
+ [kUVToG] "r"(&kYuvI601Constants.kUVToG),
+ [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
+ [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23");
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READNV12 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READNV21 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READNV12 YUVTORGB
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUY2 YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READUYVY YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile(
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "q0");
+}
+
+// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile(
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0");
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0");
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0");
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ // RGB24.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ // RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
+ asm volatile(
+ "vdup.32 d2, %2 \n" // dither4
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels
+ // ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile(
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels
+ // ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+ "q15");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB \
+ ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG \
+ ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR \
+ ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR \
+ ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG \
+ ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB \
+ ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile(
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ asm volatile(
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+ "q14", "q15");
+}
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ // pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ // pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ // pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+ asm volatile(
+ "vdup.32 q0, %3 \n"
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, q0 \n" // adjust exponent
+ "vmul.f32 q3, q3, q0 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(1.9259299444e-34f) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// TODO(fbarchard): multiply by element.
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile(
+ "vdup.32 q0, %3 \n"
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, q0 \n" // adjust exponent
+ "vmul.f32 q3, q3, q0 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_neon64.cc b/libyuv/src/main/cpp/libyuv/source/row_neon64.cc
new file mode 100644
index 0000000..2f4d149
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_neon64.cc
@@ -0,0 +1,2620 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
+ "ld1 {v1.s}[1], [%2], #4 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+#define YUVTORGB_SETUP \
+ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+#define YUVTORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v1.8h, v27.8h \n" \
+ "mul v5.8h, v1.8h, v29.8h \n" \
+ "mul v6.8h, v2.8h, v30.8h \n" \
+ "mul v7.8h, v2.8h, v28.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB \
+ ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG \
+ ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR \
+ ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB \
+ ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG \
+ ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR \
+ ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB \
+ ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG \
+ ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV444
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "ld1 {v23.8b}, [%3], #8 \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV422
+ YUVTORGB(v23, v22, v21)
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+#define ARGBTORGB565 \
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB(
+ v22, v21,
+ v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n" READYUV422 YUVTORGB(
+ v22, v21,
+ v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+#define ARGBTOARGB4444 \
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB4444
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUV400
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+ [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+ [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+ [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
+ asm volatile(
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23");
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READNV21
+ YUVTORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READNV12 YUVTORGB(
+ v22, v21,
+ v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUY2
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READUYVY
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile(
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0");
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile(
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0");
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0");
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %w3, %w3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1");
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0");
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ asm volatile(
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+ asm volatile(
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ // pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ // pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_uyvyb = src_uyvy + stride_uyvy;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
+ asm volatile(
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ // ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile(
+ "movi v4.16b, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ // ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
+ // pixels
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ asm volatile(
+ "movi v24.8b, #112 \n" // UB / VR 0.875
+ // coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+ "v27", "v28", "v29");
+}
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB \
+ ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR \
+ ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG \
+ ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG \
+ ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR \
+ ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB \
+ ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+void ARGBToUVRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_bgra_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_rgba_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+ asm volatile(
+ "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
+ // 2
+ "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
+ "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
+ "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
+ "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
+ "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in
+ // 16-bit)
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v17.D[0] \n"
+ "ins v18.D[1], v19.D[0] \n"
+ "ins v20.D[1], v21.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v18.8h, #1 \n"
+ "urshr v6.8h, v20.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v16.8h, v4.8h, v22.8h \n" // B
+ "mls v16.8h, v5.8h, v23.8h \n" // G
+ "mls v16.8h, v6.8h, v24.8h \n" // R
+ "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
+ "mul v17.8h, v6.8h, v22.8h \n" // R
+ "mls v17.8h, v5.8h, v26.8h \n" // G
+ "mls v17.8h, v4.8h, v25.8h \n" // B
+ "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28"
+
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+ "v27");
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ // pixels
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ // pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
+
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18");
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile(
+ // Attenuate 8 pixels.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of
+ // ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ asm volatile(
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
+}
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile(
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
+}
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8
+ // pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v22", "v23", "v24", "v25");
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ // pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ // pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ // pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Caveat - rounds float to half float whereas scaling version truncates.
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/row_win.cc b/libyuv/src/main/cpp/libyuv/source/row_win.cc
new file mode 100644
index 0000000..202f2b8
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/row_win.cc
@@ -0,0 +1,6341 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+// This module is for Visual C 32/64 bit and clangcl 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+
+#if defined(_M_X64)
+#include
+#include // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// 64 bit
+#if defined(_M_X64)
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8;
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+ a_buf += 8;
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(yuvconstants) \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm2 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+ xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
+ xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
+ xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm0 = _mm_adds_epi16(xmm0, xmm4); \
+ xmm1 = _mm_adds_epi16(xmm1, xmm4); \
+ xmm2 = _mm_adds_epi16(xmm2, xmm4); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
+ _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
+ _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+ dst_argb += 32;
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm4;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ while (width > 0) {
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ while (width > 0) {
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
+// 32 bit
+#else // defined(_M_X64)
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0};
+
+// JPeg full range.
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0};
+
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
+
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
+
+static const vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+
+// Constants for BGRA.
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+ 0, 33, 65, 13, 0, 33, 65, 13};
+
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
+
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR.
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+ 33, 65, 13, 0, 33, 65, 13, 0};
+
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
+
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+ 0, 13, 65, 33, 0, 13, 65, 33};
+
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
+
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
+
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
+
+// 7 bit fixed point 0.5.
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0
+ punpckhwd xmm1, xmm1
+ por xmm0, xmm5
+ por xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklbw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhwd ymm1, ymm0, ymm0
+ vpunpcklwd ymm0, ymm0, ymm0
+ vpor ymm0, ymm0, ymm5
+ vpor ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_J400TOARGBROW_AVX2
+
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqu [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqu [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqu [edx + 16], xmm1
+ por xmm3, xmm5
+ movdqu [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqu [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqu [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqu [edx + 16], xmm1
+ por xmm3, xmm5
+ movdqu [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
+ uint8* dst_rgb24,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_rgb24
+ mov ecx, [esp + 12] // width
+ movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
+ movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
+ movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 4]
+ movdqu xmm2, [eax + 8]
+ lea eax, [eax + 24]
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpsllw ymm4, ymm4, 10
+ vpsrlw ymm4, ymm4, 5
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
+ vpsllw ymm1, ymm0, 1 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpand ymm1, ymm1, ymm3
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpsraw ymm2, ymm0, 8 // A
+ vpand ymm0, ymm0, ymm4 // G in middle 5 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
+ vpand ymm2, ymm2, ymm7
+ vpor ymm0, ymm0, ymm2 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+ vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
+ vpand ymm2, ymm0, ymm5 // mask high nibbles
+ vpand ymm0, ymm0, ymm4 // mask low nibbles
+ vpsrlw ymm3, ymm2, 4
+ vpsllw ymm1, ymm0, 4
+ vpor ymm2, ymm2, ymm3
+ vpor ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm2, ymm2, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm2
+ vpunpcklbw ymm0, ymm0, ymm2
+ vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB4444TOARGBROW_AVX2
+
+// 24 instructions
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// 18 instructions.
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1
+ por xmm2, xmm3
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
+ __asm {
+
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ movd xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // width
+ punpcklbw xmm6, xmm6 // make dither 16 bytes
+ movdqa xmm7, xmm6
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ paddusb xmm0, xmm6 // add dither
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ vbroadcastss xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // width
+ vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
+ vpermq ymm6, ymm6, 0xd8
+ vpunpcklwd ymm6, ymm6, ymm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpaddusb ymm0, ymm0, ymm6 // add dither
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
+ psrld xmm0, 4
+ psrld xmm1, 8
+ por xmm0, xmm1
+ packuswb xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm4, ymm4, ymm4
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpslld ymm7, ymm7, 15
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpsllw ymm4, ymm4, 12
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
+ vpsrld ymm1, ymm1, 8
+ vpsrld ymm0, ymm0, 4
+ vpor ymm0, ymm0, ymm1
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB4444ROW_AVX2
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToY
+ movdqa xmm5, xmmword ptr kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToYJ
+ movdqa xmm5, xmmword ptr kAddYJ64
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ paddw xmm2, xmm5
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ vbroadcastf128 ymm4, xmmword ptr kARGBToY
+ vbroadcastf128 ymm5, xmmword ptr kAddY16
+ vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vpaddb ymm0, ymm0, ymm5 // add 16 for Y
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
+ vbroadcastf128 ymm5, xmmword ptr kAddYJ64
+ vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
+ vpaddw ymm2, ymm2, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kBGRAToY
+ movdqa xmm5, xmmword ptr kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kABGRToY
+ movdqa xmm5, xmmword ptr kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kRGBAToY
+ movdqa xmm5, xmmword ptr kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kARGBToV
+ movdqa xmm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm6, xmmword ptr kARGBToVJ
+ movdqa xmm7, xmmword ptr kARGBToUJ
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToV
+ vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+ vpaddb ymm0, ymm0, ymm5 // -> unsigned
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToV
+ vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
+ vpaddw ymm0, ymm0, ymm5
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kARGBToV
+ movdqa xmm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+
+ movdqu xmm0, [eax] // V
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ lea eax, [eax + 64]
+ movdqu [edx + edi], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kBGRAToV
+ movdqa xmm7, xmmword ptr kBGRAToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kABGRToV
+ movdqa xmm7, xmmword ptr kABGRToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kRGBAToV
+ movdqa xmm7, xmmword ptr kRGBAToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 \
+ __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 \
+ __asm lea ebp, [ebp + 16]}
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 32]}
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 32]}
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ __asm { \
+ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
+ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
+ __asm vpsubw ymm2, ymm3, ymm2 \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
+ __asm vpsubw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
+ __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
+ __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ }
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu 0[edx], ymm1 \
+ __asm vmovdqu 32[edx], ymm0 \
+ __asm lea edx, [edx + 64]}
+
+// Store 16 RGBA values.
+#define STORERGBA_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vmovdqu [edx], ymm0 \
+ __asm vmovdqu [edx + 32], ymm1 \
+ __asm lea edx, [edx + 64]}
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void I422ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void I444ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ convertloop:
+ READYUV444_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void NV12ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void NV21ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV21_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+ const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUY2_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked) void UYVYToARGBRow_AVX2(
+ const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READUYVY_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+__declspec(naked) void I422ToRGBARow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // abgr
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STORERGBA_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Allows a conversion with half size scaling.
+
+// Read 8 UV from 444.
+#define READYUV444 \
+ __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 \
+ __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 \
+ __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 VU from NV21, upsample to 8 UV.
+#define READNV21 \
+ __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm lea esi, [esi + 8] \
+ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+#define READYUY2 \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
+ __asm movdqu xmm0, [eax] /* UV */ \
+ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 16]}
+
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+#define READUYVY \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
+ __asm movdqu xmm0, [eax] /* UV */ \
+ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 16]}
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(YuvConstants) \
+ __asm { \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm movdqa xmm3, xmm0 \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
+ __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm psubw xmm0, xmm1 \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
+ __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm psubw xmm1, xmm2 \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
+ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm psubw xmm2, xmm3 \
+ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
+ __asm paddsw xmm0, xmm4 /* B += Y */ \
+ __asm paddsw xmm1, xmm4 /* G += Y */ \
+ __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm0 \
+ __asm movdqu 16[edx], xmm1 \
+ __asm lea edx, [edx + 32]}
+
+// Store 8 BGRA values.
+#define STOREBGRA \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32]}
+
+// Store 8 RGBA values.
+#define STORERGBA \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32]}
+
+// Store 8 RGB24 values.
+#define STORERGB24 \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24]}
+
+// Store 8 RGB565 values.
+#define STORERGB565 \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
+ __asm packssdw xmm0, xmm1 \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16]}
+
+// 8 pixels.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) void I444ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV444
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGB24
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb565_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ psrld xmm5, 27
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ psrld xmm6, 26
+ pslld xmm6, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pslld xmm7, 11
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGB565
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) void I422ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA422
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READNV12
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READNV21
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+ const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUY2
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+ const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READUYVY
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ ret
+ }
+}
+
+__declspec(naked) void I422ToRGBARow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGBA
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ movd xmm2, eax
+ pshufd xmm2, xmm2,0
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y.Y
+ pmulhuw xmm0, xmm2
+ psubusw xmm0, xmm3
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ por xmm0, xmm4
+ por xmm1, xmm4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ vmovd xmm2, eax
+ vbroadcastss ymm2, xmm2
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ vmovd xmm3, eax
+ vbroadcastss ymm3, xmm3
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpslld ymm4, ymm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpmulhuw ymm0, ymm0, ymm2
+ vpsubusw ymm0, ymm0, ymm3
+ vpsrlw ymm0, ymm0, 6
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm4
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked) void MirrorRow_SSSE3(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, xmmword ptr kShuffleMirror
+
+ convertloop:
+ movdqu xmm0, [eax - 16 + ecx]
+ pshufb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
+
+ convertloop:
+ vmovdqu ymm0, [eax - 32 + ecx]
+ vpshufb ymm0, ymm0, ymm5
+ vpermq ymm0, ymm0, 0x4e // swap high and low halfs
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, xmmword ptr kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16]
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufb xmm0, xmm1
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [edx + edi], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
+
+ convertloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufd xmm0, xmm0, 0x1b
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
+
+ convertloop:
+ vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm3, ymm1, 8
+ vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpackuswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 0xd8
+ vpermq ymm2, ymm2, 0xd8
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + edi], ymm2
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
+ lea eax, [eax + 32]
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+ vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+ vextractf128 [edi + 48], ymm0, 1 // bytes 47..63
+ lea edi, [edi + 64]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
+__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ test eax, 15
+ jne convertloopu
+ test edx, 15
+ jne convertloopu
+
+ convertloopa:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloopa
+ ret
+
+ convertloopu:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloopu
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
+__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_COPYROW_AVX
+
+// Multiple of 1.
+__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi
+ mov edx, edi
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ rep movsb
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + 32]
+ lea eax, [eax + 64]
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
+ uint8* dst_a,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
+ mov ecx, [esp + 12] // width
+
+ extractloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm0, 24
+ psrld xmm1, 24
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg extractloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
+ uint8* dst_a,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
+ mov ecx, [esp + 12] // width
+ vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
+
+ extractloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpsrld ymm0, ymm0, 24
+ vpsrld ymm1, ymm1, 24
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ lea eax, [eax + 128]
+ vpackssdw ymm0, ymm0, ymm1 // mutates
+ vpsrld ymm2, ymm2, 24
+ vpsrld ymm3, ymm3, 24
+ vpackssdw ymm2, ymm2, ymm3 // mutates
+ vpackuswb ymm0, ymm0, ymm2 // mutates
+ vpermd ymm0, ymm4, ymm0 // unmutate
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg extractloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movq xmm2, qword ptr [eax] // 8 Y's
+ lea eax, [eax + 8]
+ punpcklbw xmm2, xmm2
+ punpckhwd xmm3, xmm2
+ punpcklwd xmm2, xmm2
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
+ uint8* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vpmovzxbd ymm1, qword ptr [eax]
+ vpmovzxbd ymm2, qword ptr [eax + 8]
+ lea eax, [eax + 16]
+ vpslld ymm1, ymm1, 24
+ vpslld ymm2, ymm2, 24
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be multiple of 4.
+__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
+ __asm {
+ movzx eax, byte ptr [esp + 8] // v8
+ mov edx, 0x01010101 // Duplicate byte to all bytes.
+ mul edx // overwrites edx with upper part of result.
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+
+// Write 'count' bytes using an 8 bit value repeated.
+__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8
+ mov ecx, [esp + 12] // count
+ rep stosb
+ mov edi, edx
+ ret
+ }
+}
+
+// Write 'count' 32 bit values.
+__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0x00
+
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ movd xmm7, eax
+ pshufd xmm7, xmm7, 0x00
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi
+ sub edx, esi
+ sub edi, esi
+
+ // 8 pixel loop.
+ convertloop8:
+ movq xmm0, qword ptr [esi] // alpha
+ punpcklbw xmm0, xmm0
+ pxor xmm0, xmm5 // a, 255-a
+ movq xmm1, qword ptr [eax + esi] // src0
+ movq xmm2, qword ptr [edx + esi] // src1
+ punpcklbw xmm1, xmm2
+ psubb xmm1, xmm6 // bias src0/1 - 128
+ pmaddubsw xmm0, xmm1
+ paddw xmm0, xmm7 // unbias result - 32768 and round.
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edi + esi], xmm0
+ lea esi, [esi + 8]
+ sub ecx, 8
+ jg convertloop8
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
+ vpsllw ymm5, ymm5, 8
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ vmovd xmm7, eax
+ vbroadcastss ymm7, xmm7
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi
+ sub edx, esi
+ sub edi, esi
+
+ // 32 pixel loop.
+ convertloop32:
+ vmovdqu ymm0, [esi] // alpha
+ vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
+ vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
+ vpxor ymm3, ymm3, ymm5 // a, 255-a
+ vpxor ymm0, ymm0, ymm5 // a, 255-a
+ vmovdqu ymm1, [eax + esi] // src0
+ vmovdqu ymm2, [edx + esi] // src1
+ vpunpckhbw ymm4, ymm1, ymm2
+ vpunpcklbw ymm1, ymm1, ymm2
+ vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
+ vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpmaddubsw ymm0, ymm0, ymm1
+ vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
+ vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
+ vpsrlw ymm3, ymm3, 8
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm3
+ vmovdqu [edi + esi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg convertloop32
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+
+// Blend 8 pixels at a time.
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, xmmword ptr kShuffleAlpha0
+ movdqa xmm5, xmmword ptr kShuffleAlpha1
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqu xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqu xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // mask original alpha
+ lea eax, [eax + 16]
+ pand xmm2, xmm3
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ por xmm0, xmm2 // copy original alpha
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
+ 128u, 128u, 14u, 15u, 14u, 15u,
+ 14u, 15u, 128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas
+ vpshufb ymm3, ymm1, ymm4 // high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * a
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * a
+ vpand ymm6, ymm6, ymm5 // isolate alpha
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
+ mov ecx, [esp + 12 + 12] // width
+ lea ebx, fixed_invtbl8
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr [ebx + esi * 4]
+ movd xmm3, dword ptr [ebx + edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * a
+
+ movdqu xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // forth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr [ebx + esi * 4]
+ movd xmm3, dword ptr [ebx + edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * a
+ lea eax, [eax + 16]
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
+ mov ecx, [esp + 12 + 12] // width
+ sub edx, eax
+ lea ebx, fixed_invtbl8
+ vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToYJ
+ movdqa xmm5, xmmword ptr kAddYJ64
+
+ convertloop:
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, xmmword ptr kARGBToSepiaB
+ movdqa xmm3, xmmword ptr kARGBToSepiaG
+ movdqa xmm4, xmmword ptr kARGBToSepiaR
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
+ movdqu xmm5, [ecx]
+ pshufd xmm2, xmm5, 0x00
+ pshufd xmm3, xmm5, 0x55
+ pshufd xmm4, xmm5, 0xaa
+ pshufd xmm5, xmm5, 0xff
+ mov ecx, [esp + 16] /* width */
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm7, xmm2
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm6, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm7, xmm4
+ phaddsw xmm1, xmm7 // R
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm6, xmm5
+ pmaddubsw xmm7, xmm5
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
+ lea eax, [eax + 32]
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h
+ pshufd xmm2, xmm2, 044h
+ pshuflw xmm3, xmm3, 040h
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqu xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqu xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_size / 2
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm7
+ movdqu [eax], xmm0
+ lea eax, [eax + 16]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ punpcklbw xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ lea eax, [eax + 16]
+ lea esi, [esi + 16]
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ sub ecx, 4
+ jl convertloop49
+
+ convertloop4:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
+
+ convertloop49:
+ add ecx, 4 - 1
+ jl convertloop19
+
+ convertloop1:
+ movd xmm0, [eax] // read 1 pixels from src_argb0
+ lea eax, [eax + 4]
+ movd xmm1, [esi] // read 1 pixels from src_argb1
+ lea esi, [esi + 4]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop19:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ convertloop:
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpackuswb ymm0, ymm0, ymm1
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
+ mov edi, [esp + 8 + 12] // src_y2
+ mov edx, [esp + 8 + 16] // dst_sobelx
+ mov ecx, [esp + 8 + 20] // width
+ sub esi, eax
+ sub edi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
+ mov edx, [esp + 4 + 12] // dst_sobely
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
+ por xmm2, xmm5
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
+ por xmm0, xmm5
+ movdqu [edx], xmm1
+ movdqu [edx + 16], xmm2
+ movdqu [edx + 32], xmm3
+ movdqu [edx + 48], xmm0
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
+ punpcklbw xmm3, xmm5
+ punpckhbw xmm0, xmm5
+ movdqa xmm4, xmm1 // YS
+ punpcklbw xmm4, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
+ movdqu [edx], xmm6
+ movdqu [edx + 16], xmm4
+ movdqu [edx + 32], xmm7
+ movdqu [edx + 48], xmm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+// in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time.
+// This function requires alignment on accumulation buffer pointers.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
+ int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width
+ movd xmm5, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm5, xmm5
+ rcpss xmm4, xmm5 // 1.0f / area
+ pshufd xmm4, xmm4, 0
+ sub ecx, 4
+ jl l4b
+
+ cmp area, 128 // 128 pixels will not overflow 15 bits.
+ ja l4
+
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ psrld xmm6, 16
+ cvtdq2ps xmm6, xmm6
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts
+
+ // 4 pixel loop small blocks.
+ s4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
+ packssdw xmm2, xmm3
+
+ pmulhuw xmm0, xmm5
+ pmulhuw xmm2, xmm5
+
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge s4
+
+ jmp l4b
+
+ // 4 pixel loop
+ l4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ l1:
+ movdqu xmm0, [eax]
+ psubd xmm0, [eax + edx * 4]
+ lea eax, [eax + 16]
+ psubd xmm0, [esi]
+ paddd xmm0, [esi + edx * 4]
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ sub ecx, 4
+ jl l4b
+ test edx, 15
+ jne l4b
+
+ // 4 pixel loop
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1
+ punpckhwd xmm3, xmm1
+
+ punpckhbw xmm4, xmm1
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+
+ paddd xmm0, xmm2
+ movdqu xmm2, [esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3
+ movdqu xmm3, [esi + 16]
+ paddd xmm3, xmm0
+
+ paddd xmm0, xmm4
+ movdqu xmm4, [esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5
+ movdqu xmm5, [esi + 48]
+ lea esi, [esi + 64]
+ paddd xmm5, xmm0
+
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1
+ punpcklwd xmm2, xmm1
+ paddd xmm0, xmm2
+ movdqu xmm2, [esi]
+ lea esi, [esi + 16]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ movq qword ptr 8[edx], xmm6
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ sub edi, esi
+ cmp eax, 128
+ je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
+
+ vmovd xmm0, eax // high fraction 0..255
+ neg eax
+ add eax, 256
+ vmovd xmm5, eax // low fraction 256..1
+ vpunpcklbw xmm5, xmm5, xmm0
+ vpunpcklwd xmm5, xmm5, xmm5
+ vbroadcastss ymm5, xmm5
+
+ mov eax, 0x80808080 // 128b for bias and rounding.
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+
+ xloop:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm2, [esi + edx]
+ vpunpckhbw ymm1, ymm0, ymm2 // mutates
+ vpunpcklbw ymm0, ymm0, ymm2
+ vpsubb ymm1, ymm1, ymm4 // bias to signed image
+ vpsubb ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm5, ymm1
+ vpmaddubsw ymm0, ymm5, ymm0
+ vpaddw ymm1, ymm1, ymm4 // unbias and round
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 8
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop50
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ rep movsb
+
+ xloop99:
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1
+// TODO(fbarchard): Consider allowing 256 using memcpy.
+__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 /256. Blend 100 / 0.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+
+ movd xmm0, eax // high fraction 0..255
+ neg eax
+ add eax, 256
+ movd xmm5, eax // low fraction 255..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0x00
+
+ xloop:
+ movdqu xmm0, [esi]
+ movdqu xmm2, [esi + edx]
+ movdqu xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ psubb xmm0, xmm4 // bias image by -128
+ psubb xmm1, xmm4
+ movdqa xmm2, xmm5
+ movdqa xmm3, xmm5
+ pmaddubsw xmm2, xmm0
+ pmaddubsw xmm3, xmm1
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+ psrlw xmm2, 8
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [esi + edi], xmm2
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop50
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ movdqu xmm0, [esi]
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqu xmm5, [ecx]
+ mov ecx, [esp + 16] // width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
+ mov ecx, [esp + 16] // width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov esi, [esp + 8 + 12] // shuffler
+ mov ecx, [esp + 8 + 16] // width
+ pxor xmm5, xmm5
+
+ mov ebx, [esi] // shuffler
+ cmp ebx, 0x03000102
+ je shuf_3012
+ cmp ebx, 0x00010203
+ je shuf_0123
+ cmp ebx, 0x00030201
+ je shuf_0321
+ cmp ebx, 0x02010003
+ je shuf_2103
+
+ // TODO(fbarchard): Use one source pointer and 3 offsets.
+ shuf_any1:
+ movzx ebx, byte ptr [esi]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx], bl
+ movzx ebx, byte ptr [esi + 1]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 1], bl
+ movzx ebx, byte ptr [esi + 2]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 2], bl
+ movzx ebx, byte ptr [esi + 3]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 3], bl
+ lea eax, [eax + 4]
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jg shuf_any1
+ jmp shuf99
+
+ shuf_0123:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+ pshuflw xmm0, xmm0, 01Bh
+ pshufhw xmm1, xmm1, 01Bh
+ pshuflw xmm1, xmm1, 01Bh
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_0123
+ jmp shuf99
+
+ shuf_0321:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+ pshuflw xmm0, xmm0, 039h
+ pshufhw xmm1, xmm1, 039h
+ pshuflw xmm1, xmm1, 039h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_0321
+ jmp shuf99
+
+ shuf_2103:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+ pshuflw xmm0, xmm0, 093h
+ pshufhw xmm1, xmm1, 093h
+ pshuflw xmm1, xmm1, 093h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_2103
+ jmp shuf99
+
+ shuf_3012:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+ pshuflw xmm0, xmm0, 0C6h
+ pshufhw xmm1, xmm1, 0C6h
+ pshuflw xmm1, xmm1, 0C6h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_3012
+
+ shuf99:
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqu [edi], xmm1
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
+ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
+
+ // 2 pixel loop.
+ convertloop:
+ // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+ // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ movd xmm4, dword ptr [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+ mulss xmm4, kExpBias
+ pshufd xmm4, xmm4, 0
+ pxor xmm5, xmm5
+ sub edx, eax
+
+ // 8 pixel loop.
+ convertloop:
+ movdqu xmm2, xmmword ptr [eax] // 8 shorts
+ add eax, 16
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm5
+ cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
+ punpckhwd xmm3, xmm5
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ psrld xmm2, 13
+ psrld xmm3, 13
+ packssdw xmm2, xmm3
+ movdqu [eax + edx - 16], xmm2
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+__declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ movd xmm4, dword ptr [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+
+ vmulss xmm4, xmm4, kExpBias
+ vbroadcastss ymm4, xmm4
+ vpxor ymm5, ymm5, ymm5
+ sub edx, eax
+
+ // 16 pixel loop.
+ convertloop:
+ vmovdqu ymm2, [eax] // 16 shorts
+ add eax, 32
+ vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
+ vpunpcklwd ymm2, ymm2, ymm5
+ vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
+ vcvtdq2ps ymm2, ymm2
+ vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
+ vmulps ymm2, ymm2, ymm4
+ vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
+ vpsrld ymm2, ymm2, 13
+ vpackssdw ymm2, ymm2, ymm3
+ vmovdqu [eax + edx - 32], ymm2
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked) void HalfFloatRow_F16C(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ vbroadcastss ymm4, [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+ sub edx, eax
+
+ // 16 pixel loop.
+ convertloop:
+ vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
+ vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
+ add eax, 32
+ vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
+ vcvtdq2ps ymm3, ymm3
+ vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
+ vmulps ymm3, ymm3, ymm4
+ vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
+ vcvtps2ph xmm3, ymm3, 3
+ vmovdqu [eax + edx + 32], xmm2
+ vmovdqu [eax + edx + 32 + 16], xmm3
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Tranform ARGB pixels with color table.
+__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
+ const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Tranform RGB pixels with color table.
+__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
+ const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Tranform RGB pixels with luma table.
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ convertloop:
+ movdqu xmm0, xmmword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl
+
+ lea eax, [eax + 16]
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(_M_X64)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
diff --git a/libyuv/src/main/cpp/libyuv/source/scale.cc b/libyuv/src/main/cpp/libyuv/source/scale.cc
new file mode 100644
index 0000000..010ad9d
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale.cc
@@ -0,0 +1,1863 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include
+#include
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
+
+static void ScalePlaneDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleRowDown2_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
+ : ScaleRowDown2Box_C);
+ int row_stride = src_stride << 1;
+ (void)src_width;
+ (void)src_height;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
+ : ScaleRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_NEON
+ : ScaleRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
+ : ScaleRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
+ : ScaleRowDown2Box_SSSE3);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_AVX2
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
+ : ScaleRowDown2Box_Any_AVX2);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_AVX2
+ : ScaleRowDown2Box_AVX2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+ : ScaleRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_MSA
+ : ScaleRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void ScalePlaneDown2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleRowDown2_16_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
+ : ScaleRowDown2Box_16_C);
+ int row_stride = src_stride << 1;
+ (void)src_width;
+ (void)src_height;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 =
+ filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_16_SSE2
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
+ : ScaleRowDown2Box_16_SSE2);
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 =
+ filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
+
+static void ScalePlaneDown4(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+ int row_stride = src_stride << 2;
+ (void)src_width;
+ (void)src_height;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void ScalePlaneDown4_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+ int row_stride = src_stride << 2;
+ (void)src_width;
+ (void)src_height;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane down, 3/4
+static void ScalePlaneDown34(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ (void)src_width;
+ (void)src_height;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+ }
+ if (dst_width % 24 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+ }
+ if (dst_width % 24 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+static void ScalePlaneDown34_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ (void)src_width;
+ (void)src_height;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_16_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses box filter arranges like this
+// aaabbbcc -> abc
+// aaabbbcc def
+// aaabbbcc ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+static void ScalePlaneDown38(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ (void)src_width;
+ (void)src_height;
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+ }
+
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
+ }
+ if (dst_width % 12 == 0 && !filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ }
+ if (dst_width % 6 == 0 && filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_MSA;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+static void ScalePlaneDown38_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ (void)src_width;
+ (void)src_height;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_16_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+ }
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+#define MIN1(x) ((x) < 1 ? 1 : (x))
+
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+ uint32 sum = 0u;
+ int x;
+ assert(iboxwidth > 0);
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ return sum;
+}
+
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
+ uint32 sum = 0u;
+ int x;
+ assert(iboxwidth > 0);
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ return sum;
+}
+
+static void ScaleAddCols2_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint16* src_ptr,
+ uint8* dst_ptr) {
+ int i;
+ int scaletbl[2];
+ int minboxwidth = dx >> 16;
+ int boxwidth;
+ scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+ scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ boxwidth = MIN1((x >> 16) - ix);
+ *dst_ptr++ =
+ SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+ 16;
+ }
+}
+
+static void ScaleAddCols2_16_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint32* src_ptr,
+ uint16* dst_ptr) {
+ int i;
+ int scaletbl[2];
+ int minboxwidth = dx >> 16;
+ int boxwidth;
+ scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+ scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ boxwidth = MIN1((x >> 16) - ix);
+ *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
+ scaletbl[boxwidth - minboxwidth] >>
+ 16;
+ }
+}
+
+static void ScaleAddCols0_C(int dst_width,
+ int boxheight,
+ int x,
+ int,
+ const uint16* src_ptr,
+ uint8* dst_ptr) {
+ int scaleval = 65536 / boxheight;
+ int i;
+ src_ptr += (x >> 16);
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = src_ptr[i] * scaleval >> 16;
+ }
+}
+
+static void ScaleAddCols1_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint16* src_ptr,
+ uint8* dst_ptr) {
+ int boxwidth = MIN1(dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ int i;
+ x >>= 16;
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+static void ScaleAddCols1_16_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint32* src_ptr,
+ uint16* dst_ptr) {
+ int boxwidth = MIN1(dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixel with simple
+// averaging.
+static void ScalePlaneBox(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint16.
+ align_buffer_64(row16, src_width * 2);
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_C
+ : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+ void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+ ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleAddRow = ScaleAddRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleAddRow = ScaleAddRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ ScaleAddRow = ScaleAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleAddRow = ScaleAddRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleAddRow = ScaleAddRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ScaleAddRow = ScaleAddRow_Any_DSPR2;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_DSPR2;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ int iy = y >> 16;
+ const uint8* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row16, 0, src_width * 2);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint16*)(row16), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row16);
+ }
+}
+
+static void ScalePlaneBox_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint32.
+ align_buffer_64(row32, src_width * 4);
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
+ void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+ ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_16_SSE2;
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ int iy = y >> 16;
+ const uint16* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row32, 0, src_width * 4);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint32*)(row32), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row32);
+ }
+}
+
+// Scale plane down with bilinear interpolation.
+void ScalePlaneBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row buffer.
+ align_buffer_64(row, src_width);
+
+ const int max_y = (src_height - 1) << 16;
+ int j;
+ void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+void ScalePlaneBilinearDown_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row buffer.
+ align_buffer_64(row, src_width * 2);
+
+ const int max_y = (src_height - 1) << 16;
+ int j;
+ void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+ void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_16_DSPR2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+// Scale up down with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
+ int x, int dx) =
+ filtering ? ScaleFilterCols_C : ScaleCols_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+void ScalePlaneBilinearUp_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_16_DSPR2;
+ }
+ }
+#endif
+
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_16_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 4);
+
+ uint16* rowptr = (uint16*)row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale Plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScalePlaneSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr) {
+ int i;
+ void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x,
+ int dx) = ScaleCols_C;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+static void ScalePlaneSimple_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr) {
+ int i;
+ void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width,
+ int x, int dx) = ScaleCols_16_C;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale a plane.
+// This function dispatches to a specialized scaler based on scale factor.
+
+LIBYUV_API
+void ScalePlane(const uint8* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width && filtering != kFilterBox) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled vertically.
+ ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
+ // optimized, 3/8
+ ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
+ // optimized, 1/4
+ ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+}
+
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y,
+ int src_stride_y,
+ const uint16* src_u,
+ int src_stride_u,
+ const uint16* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16* dst_y,
+ int dst_stride_y,
+ uint16* dst_u,
+ int dst_stride_u,
+ uint16* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
+
+// Deprecated api
+LIBYUV_API
+int Scale(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ uint8* dst_u,
+ uint8* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ LIBYUV_BOOL interpolate) {
+ return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+ dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+ dst_height, interpolate ? kFilterBox : kFilterNone);
+}
+
+// Deprecated api
+LIBYUV_API
+int ScaleOffset(const uint8* src,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_width,
+ int dst_height,
+ int dst_yoffset,
+ LIBYUV_BOOL interpolate) {
+ // Chroma requires offset to multiple of 2.
+ int dst_yoffset_even = dst_yoffset & ~1;
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int aheight = dst_height - dst_yoffset_even * 2; // actual output height
+ const uint8* src_y = src;
+ const uint8* src_u = src + src_width * src_height;
+ const uint8* src_v =
+ src + src_width * src_height + src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset_even * dst_width;
+ uint8* dst_u =
+ dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth;
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset_even >> 1) * dst_halfwidth;
+ if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 ||
+ dst_height <= 0 || dst_yoffset_even < 0 ||
+ dst_yoffset_even >= dst_height) {
+ return -1;
+ }
+ return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth,
+ src_width, src_height, dst_y, dst_width, dst_u,
+ dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight,
+ interpolate ? kFilterBox : kFilterNone);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_any.cc b/libyuv/src/main/cpp/libyuv/source/scale_any.cc
new file mode 100644
index 0000000..77e04fb
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_any.cc
@@ -0,0 +1,434 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#undef CANY
+
+// Fixed scale down.
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
+
+// Fixed scale down for odd source width. Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
+ int n = (dst_width - 1) - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r + 1); \
+ }
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+ ScaleRowDown2Linear_SSSE3,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+ ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+ ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+ ScaleRowDown2Linear_MSA,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+ ScaleRowDown2Box_MSA,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+ ScaleRowDown4Box_SSSE3,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+ ScaleRowDown4Box_AVX2,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+ ScaleRowDown4Box_NEON,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+ ScaleRowDown4Box_MSA,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3,
+ ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+ ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+ ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON,
+ ScaleRowDown34_NEON,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+ ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+ ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3,
+ ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+ ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+ ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON,
+ ScaleRowDown38_NEON,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+ ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+ ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+ ScaleRowDown38_MSA,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+ ScaleRowDown38_3_Box_MSA,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+ ScaleRowDown38_2_Box_MSA,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+ ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+ ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+ ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON,
+ ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+ ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+ ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+ ScaleARGBRowDown2_MSA,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+ ScaleARGBRowDown2Linear_MSA,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+ ScaleARGBRowDown2Box_MSA,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+ dst_ptr + n * BPP, r); \
+ }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+ ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+ ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+ ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+ ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+ ScaleARGBRowDownEven_MSA,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+ ScaleARGBRowDownEvenBox_MSA,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_DSPR2
+SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_argb.cc b/libyuv/src/main/cpp/libyuv/source/scale_argb.cc
new file mode 100644
index 0000000..1ea28f0
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_argb.cc
@@ -0,0 +1,992 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include
+#include
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down a ARGB to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_C
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+ : ScaleARGBRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_SSE2
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+ : ScaleARGBRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_SSE2
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+ : ScaleARGBRowDown2Box_SSE2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+ : ScaleARGBRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+ : ScaleARGBRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+ : ScaleARGBRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+ : ScaleARGBRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down a ARGB to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);
+ ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down a ARGB to even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+ : ScaleARGBRowDownEven_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+ : ScaleARGBRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+ : ScaleARGBRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+ int64 xlast = x + (int64)(dst_width - 1) * dx;
+ int64 xl = (dx >= 0) ? x : xlast;
+ int64 xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
+ src_argb += xl * 4;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(clip_src_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of ARGB.
+ {
+ align_buffer_64(row, clip_src_width * 4);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols =
+ filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_argb + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf, int width) =
+ I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols =
+ filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
+ int yi = y >> 16;
+ int uv_yi = yi >> kYShift;
+ const uint8* src_row_y = src_y + yi * src_stride_y;
+ const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+ const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_row, src_width * 4);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+ ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+ if (src_height > 1) {
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+ if (src_height > 2) {
+ src_row_y += src_stride_y;
+ if (!(yi & 1)) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ uv_yi = yi >> kYShift;
+ src_row_y = src_y + yi * src_stride_y;
+ src_row_u = src_u + uv_yi * src_stride_u;
+ src_row_v = src_v + uv_yi * src_stride_v;
+ }
+ if (yi != lasty) {
+ // TODO(fbarchard): Convert the clipped region of row.
+ I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+ ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(row_argb);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleARGBSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+ (void)src_height;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+ dx);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// ScaleARGB a ARGB.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // ARGB does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64 clipf = (int64)(clip_x)*dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 4;
+ dst += clip_x * 4;
+ }
+ if (clip_y) {
+ int64 clipf = (int64)(clip_y)*dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy);
+ return;
+ }
+ ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+ dst, dst_stride, clip_width, clip_height);
+ return;
+ }
+ }
+ }
+ }
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled vertically.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536) {
+ ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+ if (filtering) {
+ ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+ dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
+ clip_width > 32768 || clip_height > 32768 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+ clip_height, filtering);
+ return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+ filtering);
+ return 0;
+}
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint32 src_fourcc,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ uint32 dst_fourcc,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+ int r;
+ (void)src_fourcc; // TODO(fbarchard): implement and/or assert.
+ (void)dst_fourcc;
+ I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ argb_buffer, src_width * 4, src_width, src_height);
+
+ r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+ clip_width, clip_height, filtering);
+ free(argb_buffer);
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_common.cc b/libyuv/src/main/cpp/libyuv/source/scale_common.cc
new file mode 100644
index 0000000..1bef39d
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_common.cc
@@ -0,0 +1,1312 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include
+#include
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ const uint8* s = src_ptr;
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ const uint16* s = src_ptr;
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst += 1;
+ s += 2;
+ t += 2;
+ }
+ dst[0] = (s[0] + t[0] + 1) >> 1;
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+ src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+ src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+ src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+ src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+ src_ptr[stride * 3 + 7] + 8) >>
+ 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
+ }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+ src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+ src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+ src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+ src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+ src_ptr[stride * 3 + 7] + 8) >>
+ 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
+ }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#if defined(__arm__) || defined(__aarch64__)
+#define BLENDER(a, b, f) \
+ (uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#else
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+ (uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+#endif
+
+void ScaleFilterCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64 x = (int64)(x32);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+// Same as 8 bit arm blender but return is cast to uint16
+#define BLENDER(a, b, f) \
+ (uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64 x = (int64)(x32);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] =
+ (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[1] =
+ (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >>
+ 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] =
+ (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[1] =
+ (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >>
+ 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >>
+ 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >>
+ 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ int x;
+ assert(src_width > 0);
+ for (x = 0; x < src_width - 1; x += 2) {
+ dst_ptr[0] += src_ptr[0];
+ dst_ptr[1] += src_ptr[1];
+ src_ptr += 2;
+ dst_ptr += 2;
+ }
+ if (src_width & 1) {
+ dst_ptr[0] += src_ptr[0];
+ }
+}
+
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+ int x;
+ assert(src_width > 0);
+ for (x = 0; x < src_width - 1; x += 2) {
+ dst_ptr[0] += src_ptr[0];
+ dst_ptr[1] += src_ptr[1];
+ src_ptr += 2;
+ dst_ptr += 2;
+ }
+ if (src_width & 1) {
+ dst_ptr[0] += src_ptr[0];
+ }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+ src_argb[src_stride + 4] + 2) >>
+ 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+ src_argb[src_stride + 5] + 2) >>
+ 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+ src_argb[src_stride + 6] + 2) >>
+ 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+ src_argb[src_stride + 7] + 2) >>
+ 2;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ (void)src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+ src_argb[src_stride + 4] + 2) >>
+ 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+ src_argb[src_stride + 5] + 2) >>
+ 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+ src_argb[src_stride + 6] + 2) >>
+ 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+ src_argb[src_stride + 7] + 2) >>
+ 2;
+ src_argb += src_stepx * 4;
+ dst_argb += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64 x = (int64)(x32);
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+ BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64 x = (int64)(x32);
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int bpp,
+ enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher bpp.
+ int dst_width_bytes = dst_width * bpp;
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(bpp >= 1 && bpp <= 4);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+ dst_width_bytes, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+void ScalePlaneVertical_16(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_argb,
+ uint16* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher wpp.
+ int dst_width_words = dst_width * wpp;
+ void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(wpp >= 1 && wpp <= 2);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_16_DSPR2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+ dst_width_words, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (src_width < 0) {
+ src_width = -src_width;
+ }
+ if (src_height < 0) {
+ src_height = -src_height;
+ }
+ if (filtering == kFilterBox) {
+ // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ filtering = kFilterBilinear;
+ }
+ }
+ if (filtering == kFilterBilinear) {
+ if (src_height == 1) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+ if (dst_height == src_height || dst_height * 3 == src_height) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+ // avoid reading 2 pixels horizontally that causes memory exception.
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+ if (dst_width == src_width || dst_width * 3 == src_width) {
+ filtering = kFilterNone;
+ }
+ }
+ return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+ return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+ return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering,
+ int* x,
+ int* y,
+ int* dx,
+ int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ // Check for 1 pixel and avoid FixedDiv overflow.
+ if (dst_width == 1 && src_width >= 32768) {
+ dst_width = src_width;
+ }
+ if (dst_height == 1 && src_height >= 32768) {
+ dst_height = src_height;
+ }
+ if (filtering == kFilterBox) {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_height > 1) {
+ *dy = FixedDiv1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0);
+ *y = CENTERSTART(*dy, 0);
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx;
+ *dx = -*dx;
+ // src_width = -src_width; // Caller must do this.
+ }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_dspr2.cc b/libyuv/src/main/cpp/libyuv/source/scale_dspr2.cc
new file mode 100644
index 0000000..ddedcbf
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_dspr2.cc
@@ -0,0 +1,668 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
+ "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
+ "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
+ "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 1(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
+ : [dst_width] "r"(dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ const uint8* t = src_ptr + src_stride;
+
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "bltz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+ "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+ "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n"
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n"
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n"
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n"
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
+ : [dst_width] "r"(dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void ScaleRowDown4_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n"
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
+ "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
+ "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
+ "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
+ "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t5, 4(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 7 \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t1, 2(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
+ : [dst_width] "r"(dst_width)
+ : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ const uint8* s2 = s1 + stride;
+ const uint8* s3 = s2 + stride;
+
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 1 \n"
+ "andi $t8, %[dst_width], 1 \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
+ "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
+ "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
+ "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
+ "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
+ "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "add $t4, $t4, $t5 \n"
+ "add $t6, $t6, $t7 \n"
+ "add $t4, $t4, $t6 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "shra_r.w $t4, $t4, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[s3], %[s3], 8 \n"
+ "addiu $t9, $t9, -1 \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 2 \n"
+ "beqz $t8, 2f \n"
+ " nop \n"
+
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+
+ "2: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
+ [s3] "+r"(s3)
+ : [dst_width] "r"(dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void ScaleRowDown34_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
+ "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
+ "addiu %[dst_width], %[dst_width], -24 \n"
+ "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
+ "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
+ "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
+ "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
+ "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
+ "prepend $t1, $t2, 8 \n" // |4|3|1|0|
+ "prepend $t3, $t4, 24 \n" // |15|13|12|11|
+ "prepend $t5, $t6, 8 \n" // |20|19|17|16|
+ "prepend $t7, $t8, 24 \n" // |31|29|28|27|
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t3, 8(%[dst]) \n"
+ "sw $t5, 12(%[dst]) \n"
+ "sw $t9, 16(%[dst]) \n"
+ "sw $t7, 20(%[dst]) \n"
+ "bnez %[dst_width], 1b \n"
+ " addiu %[dst], %[dst], 24 \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t3, 3 \n" // 0x00030003
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
+ "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t2, $t2, $t4 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "sll $t5, $t0, 1 \n"
+ "add $t0, $t5, $t0 \n"
+ "shra_r.ph $t2, $t2, 2 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shll.ph $t4, $t2, 1 \n"
+ "addq.ph $t4, $t4, $t2 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.w $t0, $t0, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "srl $t1, $t6, 16 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
+ [dst_width] "+r"(dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
+}
+
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t2, 3 \n" // 0x00030003
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
+ "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t4, $t4, $t3 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shra_r.ph $t4, $t4, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.ph $t6, $t6, 1 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "srl $t1, $t6, 16 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
+ [dst_width] "+r"(dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
+}
+
+void ScaleRowDown38_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t6, $t6 \n" // |26|27|24|25|
+ "srl $t0, $t0, 8 \n" // |X|2|3|0|
+ "srl $t3, $t3, 16 \n" // |X|X|15|14|
+ "srl $t5, $t5, 16 \n" // |X|X|23|22|
+ "srl $t7, $t7, 16 \n" // |X|X|31|30|
+ "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
+ "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
+ "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
+ "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
+ "prepend $t2, $t3, 24 \n" // |X|15|14|11|
+ "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
+ "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu %[dst_width], %[dst_width], -12 \n"
+ "addiu $t8,%[dst_width], -12 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t4, 4(%[dst]) \n"
+ "sw $t6, 8(%[dst]) \n"
+ "bgez $t8, 1b \n"
+ " addiu %[dst], %[dst], 12 \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
+}
+
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* t = src_ptr + stride;
+ const int c = 0x2AAA;
+
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
+ "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
+ "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
+ "srl $t4, $t4, 2 \n" // t4 / 4
+ "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
+ "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
+ "addu $t6, $t5, $t6 \n"
+ "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
+ "addu $t0, $t0, $t2 \n"
+ "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[t], %[t], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t4, -1(%[dst_ptr]) \n"
+ "sb $t6, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
+ [dst_width] "+r"(dst_width)
+ : [c] "r"(c)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
+}
+
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ stride += stride;
+ const uint8* s2 = src_ptr + stride;
+ const int c1 = 0x1C71;
+ const int c2 = 0x2AAA;
+
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
+ "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
+ "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
+ "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
+ "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
+ "raddu.w.qb $t8, $t8 \n" // R5+R4
+ "addu $t7, $t7, $t8 \n"
+ "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
+ "raddu.w.qb $t8, $t8 \n" // R7 + R6
+ "addu $t6, $t6, $t8 \n"
+ "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
+ "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
+ "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
+ "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
+ "addu $t7, $t7, $t8 \n"
+ "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "raddu.w.qb $t4, $t4 \n"
+ "addu $t0, $t0, $t2 \n"
+ "addu $t0, $t0, $t4 \n"
+ "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t7, $t7, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t6, -1(%[dst_ptr]) \n"
+ "sb $t7, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
+ [s2] "+r"(s2), [dst_width] "+r"(dst_width)
+ : [c1] "r"(c1), [c2] "r"(c2)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
+}
+
+void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ int x;
+ for (x = 0; x < ((src_width - 1)); x += 8) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
+ uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t5], 0(%[src_ptr]) \n"
+ "lw %[tmp_t6], 4(%[src_ptr]) \n"
+ "lw %[tmp_t1], 0(%[dst_ptr]) \n"
+ "lw %[tmp_t2], 4(%[dst_ptr]) \n"
+ "lw %[tmp_t3], 8(%[dst_ptr]) \n"
+ "lw %[tmp_t4], 12(%[dst_ptr]) \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
+ "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
+ "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
+ "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
+ "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
+ "sw %[tmp_t1], 0(%[dst_ptr]) \n"
+ "sw %[tmp_t2], 4(%[dst_ptr]) \n"
+ "sw %[tmp_t3], 8(%[dst_ptr]) \n"
+ "sw %[tmp_t4], 12(%[dst_ptr]) \n"
+ ".set pop \n"
+ :
+ [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
+ [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
+ : [dst_ptr] "r"(dst_ptr));
+ src_ptr += 8;
+ dst_ptr += 8;
+ }
+
+ if ((src_width)&7) {
+ for (x = 0; x < ((src_width - 1) & 7); x += 1) {
+ dst_ptr[0] += src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 1;
+ }
+ }
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_gcc.cc b/libyuv/src/main/cpp/libyuv/source/scale_gcc.cc
new file mode 100644
index 0000000..f0ac56f
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_gcc.cc
@@ -0,0 +1,1377 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile (
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ intptr_t stridex3;
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "=&r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile (
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(src_stride * 3)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "lea " MEMLEA(0xc,1) ",%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
+ "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1, temp_pixel;
+ asm volatile (
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + 1
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x2,0) ",%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2," MEMACCESS(0) " \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "=&a"(temp_pixel), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1), // %4
+#if defined(__x86_64__)
+ "+rm"(dst_width) // %5
+#else
+ "+m"(dst_width) // %5
+#endif
+ : "rm"(x), // %6
+ "rm"(dx), // %7
+#if defined(__x86_64__)
+ "x"(kFsub80), // %8
+ "x"(kFadd40) // %9
+#else
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
+#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ (void)src_stride;
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ LABELALIGN
+ "1: \n"
+ "movd " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
+ MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "=&r"(src_stepx_x12) // %4
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ intptr_t row1 = (intptr_t)(src_stride);
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
+ MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
+ MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "movq " MEMACCESS(5) ",%%xmm2 \n"
+ MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
+ MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
+ MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "=&r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ LABELALIGN
+ "40: \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
+ MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x8,2) ",%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "99: \n"
+ : "=&a"(x0), // %0
+ "=&d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ asm volatile(
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "psrlw $0x9,%%xmm1 \n"
+ MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(0) " \n"
+
+ LABELALIGN
+ "99: \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_msa.cc b/libyuv/src/main/cpp/libyuv/source/scale_msa.cc
new file mode 100644
index 0000000..bfcd10f
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_msa.cc
@@ -0,0 +1,553 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "libyuv/scale_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ ST_UB(dst0, dst_argb);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+ vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
+ ST_UB(dst0, dst_argb);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+ v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+ v8u16 reg0, reg1, reg2, reg3;
+ v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
+
+ for (x = 0; x < dst_width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
+ vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+ vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
+ vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
+ reg0 = __msa_hadd_u_h(vec0, vec0);
+ reg1 = __msa_hadd_u_h(vec1, vec1);
+ reg2 = __msa_hadd_u_h(vec2, vec2);
+ reg3 = __msa_hadd_u_h(vec3, vec3);
+ reg0 += reg2;
+ reg1 += reg3;
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
+ reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ ST_UB(dst0, dst_argb);
+ s += 32;
+ t += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int32_t stepx = src_stepx * 4;
+ int32_t data0, data1, data2, data3;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 4) {
+ data0 = LW(src_argb);
+ data1 = LW(src_argb + stepx);
+ data2 = LW(src_argb + stepx * 2);
+ data3 = LW(src_argb + stepx * 3);
+ SW(data0, dst_argb);
+ SW(data1, dst_argb + 4);
+ SW(data2, dst_argb + 8);
+ SW(data3, dst_argb + 12);
+ src_argb += stepx * 4;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ int x;
+ const uint8* nxt_argb = src_argb + src_stride;
+ int32_t stepx = src_stepx * 4;
+ int64_t data0, data1, data2, data3;
+ v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 dst0;
+
+ for (x = 0; x < dst_width; x += 4) {
+ data0 = LD(src_argb);
+ data1 = LD(src_argb + stepx);
+ data2 = LD(src_argb + stepx * 2);
+ data3 = LD(src_argb + stepx * 3);
+ src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
+ src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
+ src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
+ src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
+ data0 = LD(nxt_argb);
+ data1 = LD(nxt_argb + stepx);
+ data2 = LD(nxt_argb + stepx * 2);
+ data3 = LD(nxt_argb + stepx * 3);
+ src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
+ src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
+ src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
+ src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
+ vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ reg0 = __msa_hadd_u_h(vec0, vec0);
+ reg1 = __msa_hadd_u_h(vec1, vec1);
+ reg2 = __msa_hadd_u_h(vec2, vec2);
+ reg3 = __msa_hadd_u_h(vec3, vec3);
+ reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
+ reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
+ reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
+ reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
+ reg4 += reg6;
+ reg5 += reg7;
+ reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
+ reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+ ST_UB(dst0, dst_argb);
+ src_argb += stepx * 4;
+ nxt_argb += stepx * 4;
+ dst_argb += 16;
+ }
+}
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ dst0 = __msa_aver_u_b(vec1, vec0);
+ dst1 = __msa_aver_u_b(vec3, vec2);
+ ST_UB2(dst0, dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3;
+
+ for (x = 0; x < dst_width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ vec0 = __msa_hadd_u_h(src0, src0);
+ vec1 = __msa_hadd_u_h(src1, src1);
+ vec2 = __msa_hadd_u_h(src2, src2);
+ vec3 = __msa_hadd_u_h(src3, src3);
+ vec0 += __msa_hadd_u_h(src4, src4);
+ vec1 += __msa_hadd_u_h(src5, src5);
+ vec2 += __msa_hadd_u_h(src6, src6);
+ vec3 += __msa_hadd_u_h(src7, src7);
+ vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
+ vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
+ vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
+ vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ ST_UB2(dst0, dst1, dst, 16);
+ s += 64;
+ t += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst);
+ src_ptr += 64;
+ dst += 16;
+ }
+}
+
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t0 = s + src_stride;
+ const uint8_t* t1 = s + src_stride * 2;
+ const uint8_t* t2 = s + src_stride * 3;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v4u32 reg0, reg1, reg2, reg3;
+
+ for (x = 0; x < dst_width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
+ vec0 = __msa_hadd_u_h(src0, src0);
+ vec1 = __msa_hadd_u_h(src1, src1);
+ vec2 = __msa_hadd_u_h(src2, src2);
+ vec3 = __msa_hadd_u_h(src3, src3);
+ vec0 += __msa_hadd_u_h(src4, src4);
+ vec1 += __msa_hadd_u_h(src5, src5);
+ vec2 += __msa_hadd_u_h(src6, src6);
+ vec3 += __msa_hadd_u_h(src7, src7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
+ vec0 += __msa_hadd_u_h(src0, src0);
+ vec1 += __msa_hadd_u_h(src1, src1);
+ vec2 += __msa_hadd_u_h(src2, src2);
+ vec3 += __msa_hadd_u_h(src3, src3);
+ vec0 += __msa_hadd_u_h(src4, src4);
+ vec1 += __msa_hadd_u_h(src5, src5);
+ vec2 += __msa_hadd_u_h(src6, src6);
+ vec3 += __msa_hadd_u_h(src7, src7);
+ reg0 = __msa_hadd_u_w(vec0, vec0);
+ reg1 = __msa_hadd_u_w(vec1, vec1);
+ reg2 = __msa_hadd_u_w(vec2, vec2);
+ reg3 = __msa_hadd_u_w(vec3, vec3);
+ reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
+ reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
+ reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
+ reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst);
+ s += 64;
+ t0 += 64;
+ t1 += 64;
+ t2 += 64;
+ dst += 16;
+ }
+}
+
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x, width;
+ uint64_t dst0;
+ uint32_t dst1;
+ v16u8 src0, src1, vec0;
+ v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+ (void)src_stride;
+
+ assert(dst_width % 3 == 0);
+ width = dst_width / 3;
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
+ dst0 = __msa_copy_u_d((v2i64)vec0, 0);
+ dst1 = __msa_copy_u_w((v4i32)vec0, 2);
+ SD(dst0, dst);
+ SW(dst1, dst + 8);
+ src_ptr += 32;
+ dst += 12;
+ }
+}
+
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, width;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint64_t dst0;
+ uint32_t dst1;
+ v16u8 src0, src1, src2, src3, out;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ v8i16 zero = {0};
+ v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+ v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+ v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+ v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ width = dst_width / 3;
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
+ vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
+ vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
+ vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ tmp0 = __msa_hadd_u_w(vec4, vec4);
+ tmp1 = __msa_hadd_u_w(vec5, vec5);
+ tmp2 = __msa_hadd_u_w(vec6, vec6);
+ tmp3 = __msa_hadd_u_w(vec7, vec7);
+ tmp4 = __msa_hadd_u_w(vec0, vec0);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ tmp0 = __msa_hadd_u_w(vec0, vec0);
+ tmp1 = __msa_hadd_u_w(vec1, vec1);
+ tmp0 *= const_0x2AAA;
+ tmp1 *= const_0x2AAA;
+ tmp4 *= const_0x4000;
+ tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+ tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+ tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+ out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+ dst0 = __msa_copy_u_d((v2i64)out, 0);
+ dst1 = __msa_copy_u_w((v4i32)out, 2);
+ SD(dst0, dst_ptr);
+ SW(dst1, dst_ptr + 8);
+ s += 32;
+ t += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, width;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t0 = s + src_stride;
+ const uint8_t* t1 = s + src_stride * 2;
+ uint64_t dst0;
+ uint32_t dst1;
+ v16u8 src0, src1, src2, src3, src4, src5, out;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ v8u16 zero = {0};
+ v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+ v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+ v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
+ v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ width = dst_width / 3;
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
+ vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
+ vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
+ vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+ vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+ vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+ vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
+ vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
+ vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
+ vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ tmp0 = __msa_hadd_u_w(vec4, vec4);
+ tmp1 = __msa_hadd_u_w(vec5, vec5);
+ tmp2 = __msa_hadd_u_w(vec6, vec6);
+ tmp3 = __msa_hadd_u_w(vec7, vec7);
+ tmp4 = __msa_hadd_u_w(vec0, vec0);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ tmp0 = __msa_hadd_u_w(vec0, vec0);
+ tmp1 = __msa_hadd_u_w(vec1, vec1);
+ tmp0 *= const_0x1C71;
+ tmp1 *= const_0x1C71;
+ tmp4 *= const_0x2AAA;
+ tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+ tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+ tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+ out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+ dst0 = __msa_copy_u_d((v2i64)out, 0);
+ dst1 = __msa_copy_u_w((v4i32)out, 2);
+ SD(dst0, dst_ptr);
+ SW(dst1, dst_ptr + 8);
+ s += 32;
+ t0 += 32;
+ t1 += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ int x;
+ v16u8 src0;
+ v8u16 dst0, dst1;
+ v16i8 zero = {0};
+
+ assert(src_width > 0);
+
+ for (x = 0; x < src_width; x += 16) {
+ src0 = LD_UB(src_ptr);
+ dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
+ dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
+ dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+ dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+ ST_UH2(dst0, dst1, dst_ptr, 8);
+ src_ptr += 16;
+ dst_ptr += 16;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_neon.cc b/libyuv/src/main/cpp/libyuv/source/scale_neon.cc
new file mode 100644
index 0000000..fcd2f32
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_neon.cc
@@ -0,0 +1,980 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post
+ // inc
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ // row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc");
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc");
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc");
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+ "cc");
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
+ 18, 6, 14, 19, 0, 0, 0, 0};
+static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
+static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile(
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+ "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int src_width,
+ int src_height) {
+ const uint8* src_tmp;
+ asm volatile(
+ "1: \n"
+ "mov %0, %1 \n"
+ "mov r12, %5 \n"
+ "veor q2, q2, q2 \n"
+ "veor q3, q3, q3 \n"
+ "2: \n"
+ // load 16 pixels into q0
+ "vld1.8 {q0}, [%0], %3 \n"
+ "vaddw.u8 q3, q3, d1 \n"
+ "vaddw.u8 q2, q2, d0 \n"
+ "subs r12, r12, #1 \n"
+ "bgt 2b \n"
+ "vst1.16 {q2, q3}, [%2]! \n" // store pixels
+ "add %1, %1, #16 \n"
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "=&r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
+ :
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// clang-format off
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
+// clang-format on
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_ptr;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q1, q1, q0 \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.32 {q0, q1}, [%0]! \n"
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ // pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ "vrshrn.u16 d2, q2, #1 \n"
+ "vrshrn.u16 d3, q3, #1 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ // pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ // pixels.
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ // pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "mov r12, %3, lsl #2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks ->
+ // 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
+}
+
+// clang-format off
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld1.32 {" #dn "[" #n "]}, [%6] \n"
+// clang-format on
+
+void ScaleARGBCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int tmp;
+ const uint8* src_tmp = src_argb;
+ asm volatile(
+ "1: \n" LOAD1_DATA32_LANE(
+ d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(
+ d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1)
+
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per
+ // loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "=&r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+#undef LOAD1_DATA32_LANE
+
+// clang-format off
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
+// clang-format on
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_argb;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(d0, d2, 0)
+ LOAD2_DATA32_LANE(d0, d2, 1)
+ LOAD2_DATA32_LANE(d1, d3, 0)
+ LOAD2_DATA32_LANE(d1, d3, 1)
+ "vshrn.i32 d22, q8, #9 \n"
+ "vand.16 d22, d22, d30 \n"
+ "vdup.8 d24, d22[0] \n"
+ "vdup.8 d25, d22[2] \n"
+ "vdup.8 d26, d22[4] \n"
+ "vdup.8 d27, d22[6] \n"
+ "vext.8 d4, d24, d25, #4 \n"
+ "vext.8 d5, d26, d27, #4 \n" // f
+ "veor.8 q10, q2, q3 \n" // 0x7f ^ f
+ "vmull.u8 q11, d0, d20 \n"
+ "vmull.u8 q12, d1, d21 \n"
+ "vmull.u8 q13, d2, d4 \n"
+ "vmull.u8 q14, d3, d5 \n"
+ "vadd.i16 q11, q11, q13 \n"
+ "vadd.i16 q12, q12, q14 \n"
+ "vshrn.i16 d0, q11, #7 \n"
+ "vshrn.i16 d1, q12, #7 \n"
+
+ "vst1.32 {d0, d1}, [%0]! \n" // store pixels
+ "vadd.s32 q8, q8, q9 \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_neon64.cc b/libyuv/src/main/cpp/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000..01478b5
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_neon64.cc
@@ -0,0 +1,1009 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #1 \n"
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile (
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "uadalp v0.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v0.8h, v3.16b \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile (
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
+
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+ "v20", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile (
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+ );
+}
+
+static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
+ 34, 6, 22, 35, 0, 0, 0, 0};
+static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
+static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
+
+ asm volatile (
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+ "v30", "v31", "memory", "cc"
+ );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
+ asm volatile (
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v30", "v31", "memory", "cc"
+ );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int src_width,
+ int src_height) {
+ const uint8* src_tmp;
+ asm volatile (
+ "1: \n"
+ "mov %0, %1 \n"
+ "mov w12, %w5 \n"
+ "eor v2.16b, v2.16b, v2.16b \n"
+ "eor v3.16b, v3.16b, v3.16b \n"
+ "2: \n"
+ // load 16 pixels into q0
+ "ld1 {v0.16b}, [%0], %3 \n"
+ "uaddw2 v3.8h, v3.8h, v0.16b \n"
+ "uaddw v2.8h, v2.8h, v0.8b \n"
+ "subs w12, w12, #1 \n"
+ "b.gt 2b \n"
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
+ "add %1, %1, #16 \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "=&r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
+ :
+ : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// clang-format off
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
+// clang-format on
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_ptr;
+ int64 x64 = (int64)x;
+ int64 dx64 = (int64)dx;
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v1.4s, v1.4s, v0.4s \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3",
+ "v4", "v5", "v6", "v7", "v16", "v17"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
+ asm volatile (
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ "st1 {v0.b}[15], [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction),// %4
+ "+r"(y_fraction) // %5
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+ );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
+ "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "st1 {v3.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (dst), // %1
+ "+r" (dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ // load 8 ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #1 \n"
+ "rshrn v2.8b, v2.8h, #1 \n"
+ "rshrn v3.8b, v3.8h, #1 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (src_stride), // %1
+ "+r" (dst), // %2
+ "+r" (dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile (
+ "1: \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64)(src_stepx * 4)) // %3
+ : "memory", "cc", "v0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ asm volatile (
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64)(src_stepx * 4)) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+// clang-format off
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld1 {" #vn ".s}[" #n "], [%6] \n"
+// clang-format on
+
+void ScaleARGBCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint8* src_tmp = src_argb;
+ int64 x64 = (int64)x;
+ int64 dx64 = (int64)dx;
+ int64 tmp64;
+ asm volatile (
+ "1: \n"
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "=&r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1"
+ );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// clang-format off
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
+// clang-format on
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_argb;
+ int64 x64 = (int64)x;
+ int64 dx64 = (int64)dx;
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(v0, v1, 0)
+ LOAD2_DATA32_LANE(v0, v1, 1)
+ LOAD2_DATA32_LANE(v0, v1, 2)
+ LOAD2_DATA32_LANE(v0, v1, 3)
+ "shrn v2.4h, v5.4s, #9 \n"
+ "and v2.8b, v2.8b, v4.8b \n"
+ "dup v16.8b, v2.b[0] \n"
+ "dup v17.8b, v2.b[2] \n"
+ "dup v18.8b, v2.b[4] \n"
+ "dup v19.8b, v2.b[6] \n"
+ "ext v2.8b, v16.8b, v17.8b, #4 \n"
+ "ext v17.8b, v18.8b, v19.8b, #4 \n"
+ "ins v2.d[1], v17.d[0] \n" // f
+ "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+ "umull v16.8h, v0.8b, v7.8b \n"
+ "umull2 v17.8h, v0.16b, v7.16b \n"
+ "umull v18.8h, v1.8b, v2.8b \n"
+ "umull2 v19.8h, v1.16b, v2.16b \n"
+ "add v16.8h, v16.8h, v18.8h \n"
+ "add v17.8h, v17.8h, v19.8h \n"
+ "shrn v0.8b, v16.8h, #7 \n"
+ "shrn2 v0.16b, v17.8h, #7 \n"
+
+ "st1 {v0.4s}, [%0], #16 \n" // store pixels
+ "add v5.4s, v5.4s, v6.4s \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v16", "v17", "v18", "v19"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/scale_win.cc b/libyuv/src/main/cpp/libyuv/source/scale_win.cc
new file mode 100644
index 0000000..0c5b3a1
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/scale_win.cc
@@ -0,0 +1,1390 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x1 rectangle to 16x1.
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ packuswb xmm4, xmm4
+ pxor xmm5, xmm5 // constant 0
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ packuswb xmm4, xmm4
+ pxor xmm5, xmm5 // constant 0
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add
+ paddw xmm1, xmm3
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+ pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add
+ vpaddw ymm1, ymm1, ymm3
+ vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
+ vpsrlw ymm1, ymm1, 1
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels.
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ psrld xmm5, 24
+ pslld xmm5, 16
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1.
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ movdqa xmm5, xmm4
+ packuswb xmm4, xmm4
+ psllw xmm5, 3 // constant 0x0008
+
+ wloop:
+ movdqu xmm0, [eax] // average rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add rows 0, 1
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 2
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + edi]
+ movdqu xmm3, [eax + edi + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 3
+ paddw xmm1, xmm3
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // + 8 for round
+ psrlw xmm0, 4 // /16 for average of 4 * 4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg wloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
+ vpsrld ymm5, ymm5, 24
+ vpslld ymm5, ymm5, 16
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+ vpsrlw ymm4, ymm4, 15
+ vpsllw ymm5, ymm4, 3 // constant 0x0008
+ vpackuswb ymm4, ymm4, ymm4
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + esi * 2]
+ vmovdqu ymm3, [eax + esi * 2 + 32]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 2
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + edi]
+ vmovdqu ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 3
+ vpaddw ymm1, ymm1, ymm3
+ vphaddw ymm0, ymm0, ymm1 // mutates
+ vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+ vpaddw ymm0, ymm0, ymm5 // + 8 for round
+ vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, xmmword ptr kShuf0
+ movdqa xmm4, xmmword ptr kShuf1
+ movdqa xmm5, xmmword ptr kShuf2
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShuf01
+ movdqa xmm3, xmmword ptr kShuf11
+ movdqa xmm4, xmmword ptr kShuf21
+ movdqa xmm5, xmmword ptr kMadd01
+ movdqa xmm6, xmmword ptr kMadd11
+ movdqa xmm7, xmmword ptr kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmmword ptr kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShuf01
+ movdqa xmm3, xmmword ptr kShuf11
+ movdqa xmm4, xmmword ptr kShuf21
+ movdqa xmm5, xmmword ptr kMadd01
+ movdqa xmm6, xmmword ptr kMadd11
+ movdqa xmm7, xmmword ptr kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmmword ptr kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ sub ecx, 24
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, xmmword ptr kShuf38a
+ movdqa xmm5, xmmword ptr kShuf38b
+
+ xloop:
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1
+
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ sub ecx, 12
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShufAc
+ movdqa xmm3, xmmword ptr kShufAc3
+ movdqa xmm4, xmmword ptr kScaleAc33
+ pxor xmm5, xmm5
+
+ xloop:
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
+ movhlps xmm1, xmm0
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqu xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ sub ecx, 6
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShufAb0
+ movdqa xmm3, xmmword ptr kShufAb1
+ movdqa xmm4, xmmword ptr kShufAb2
+ movdqa xmm5, xmmword ptr kScaleAb2
+
+ xloop:
+ movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm1, [eax + esi]
+ lea eax, [eax + 16]
+ pavgb xmm0, xmm1
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
+ pshufb xmm0, xmm4
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ sub ecx, 6
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
+ uint16* dst_ptr,
+ int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ pxor xmm5, xmm5
+
+ // sum rows
+ xloop:
+ movdqu xmm3, [eax] // read 16 bytes
+ lea eax, [eax + 16]
+ movdqu xmm0, [edx] // read 16 words from destination
+ movdqu xmm1, [edx + 16]
+ movdqa xmm2, xmm3
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
+ movdqu [edx], xmm0 // write 16 words to destination
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 16
+ jg xloop
+ ret
+ }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
+ uint16* dst_ptr,
+ int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ vpxor ymm5, ymm5, ymm5
+
+ // sum rows
+ xloop:
+ vmovdqu ymm3, [eax] // read 32 bytes
+ lea eax, [eax + 32]
+ vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
+ vpunpcklbw ymm2, ymm3, ymm5
+ vpunpckhbw ymm3, ymm3, ymm5
+ vpaddusw ymm0, ymm2, [edx] // sum 16 words
+ vpaddusw ymm1, ymm3, [edx + 32]
+ vmovdqu [edx], ymm0 // write 32 words to destination
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 32
+ jg xloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version.
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
+ movd xmm2, [esp + 12 + 16] // x
+ movd xmm3, [esp + 12 + 20] // dx
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ movd xmm5, eax
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pcmpeqb xmm7, xmm7 // generate 0x0001
+ psrlw xmm7, 15
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm1, 9 // 7 bit fractions.
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
+ movd xmm4, ebx
+ pshufb xmm1, xmm5 // 0011
+ punpcklwd xmm0, xmm4
+ psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
+ psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm1, xmm1 // 8 bits, 2 pixels.
+ movd ebx, xmm1
+ mov [edi], bx
+ lea edi, [edi + 2]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
+ psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm2, xmm0 // 16 bit
+ paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
+ psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm2, xmm2 // 8 bits
+ movd ebx, xmm2
+ mov [edi], bl
+
+ xloop99:
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ ret
+ }
+}
+
+// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x1 rectangle to 4x1.
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels at a time.
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ push edi
+ push esi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ paddd xmm2, xmm0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
+
+ cmp ecx, 0
+ jle xloop99
+ sub ecx, 4
+ jl xloop49
+
+ // 4 Pixel loop.
+ xloop4:
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4 // 4 pixels
+ jge xloop4
+
+ xloop49:
+ test ecx, 2
+ je xloop29
+
+ // 2 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+
+ xloop29:
+ test ecx, 1
+ je xloop99
+
+ // 1 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x2 pixels
+ movd dword ptr [edi], xmm0
+ xloop99:
+
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, xmmword ptr kShuffleColARGB
+ movdqa xmm5, xmmword ptr kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0
+
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm0
+ punpckhdq xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop
+
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked) int FixedDiv_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ idiv dword ptr [esp + 8]
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ mov ecx, [esp + 8] // denom
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ sub eax, 0x00010001
+ sbb edx, 0
+ sub ecx, 1
+ idiv ecx
+ ret
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/source/video_common.cc b/libyuv/src/main/cpp/libyuv/source/video_common.cc
new file mode 100644
index 0000000..3e9c6a2
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/source/video_common.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
+
+struct FourCCAliasEntry {
+ uint32 alias;
+ uint32 canonical;
+};
+
+static const struct FourCCAliasEntry kFourCCAliases[] = {
+ {FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU12, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
+ {FOURCC_YUYV, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
+ {FOURCC_HDYC, FOURCC_UYVY},
+ {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
+ {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
+ {FOURCC_RGB3, FOURCC_RAW},
+ {FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
+ {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB
+ {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
+ {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
+ {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
+
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+ if (kFourCCAliases[i].alias == fourcc) {
+ return kFourCCAliases[i].canonical;
+ }
+ }
+ // Not an alias, so return it as-is.
+ return fourcc;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/OWNERS b/libyuv/src/main/cpp/libyuv/tools_libyuv/OWNERS
new file mode 100644
index 0000000..aca046d
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/OWNERS
@@ -0,0 +1 @@
+kjellander@chromium.org
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/roll_deps.py b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/roll_deps.py
new file mode 100644
index 0000000..1160dbb
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/roll_deps.py
@@ -0,0 +1,481 @@
+#!/usr/bin/env python
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a modified copy of the script in
+# https://chromium.googlesource.com/external/webrtc/+/master/tools-webrtc/autoroller/roll_deps.py
+# customized for libyuv.
+
+
+"""Script to automatically roll dependencies in the libyuv DEPS file."""
+
+import argparse
+import base64
+import collections
+import logging
+import os
+import re
+import subprocess
+import sys
+import urllib
+
+
+# Skip these dependencies (list without solution name prefix).
+DONT_AUTOROLL_THESE = [
+ 'src/third_party/gflags/src',
+]
+
+LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv'
+CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src'
+CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s'
+CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
+CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
+
+COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
+CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$')
+ROLL_BRANCH_NAME = 'roll_chromium_revision'
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir,
+ os.pardir))
+CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir))
+
+sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
+import find_depot_tools
+find_depot_tools.add_depot_tools_to_path()
+from gclient import GClientKeywords
+
+CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
+CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
+ 'clang', 'scripts', 'update.py')
+
+DepsEntry = collections.namedtuple('DepsEntry', 'path url revision')
+ChangedDep = collections.namedtuple('ChangedDep',
+ 'path url current_rev new_rev')
+
+class RollError(Exception):
+ pass
+
+
+def ParseDepsDict(deps_content):
+ local_scope = {}
+ var = GClientKeywords.VarImpl({}, local_scope)
+ global_scope = {
+ 'Var': var.Lookup,
+ 'deps_os': {},
+ }
+ exec(deps_content, global_scope, local_scope)
+ return local_scope
+
+
+def ParseLocalDepsFile(filename):
+ with open(filename, 'rb') as f:
+ deps_content = f.read()
+ return ParseDepsDict(deps_content)
+
+
+def ParseRemoteCrDepsFile(revision):
+ deps_content = ReadRemoteCrFile('DEPS', revision)
+ return ParseDepsDict(deps_content)
+
+
+def ParseCommitPosition(commit_message):
+ for line in reversed(commit_message.splitlines()):
+ m = COMMIT_POSITION_RE.match(line.strip())
+ if m:
+ return m.group(1)
+ logging.error('Failed to parse commit position id from:\n%s\n',
+ commit_message)
+ sys.exit(-1)
+
+
+def _RunCommand(command, working_dir=None, ignore_exit_code=False,
+ extra_env=None):
+ """Runs a command and returns the output from that command.
+
+ If the command fails (exit code != 0), the function will exit the process.
+
+ Returns:
+ A tuple containing the stdout and stderr outputs as strings.
+ """
+ working_dir = working_dir or CHECKOUT_SRC_DIR
+ logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
+ env = os.environ.copy()
+ if extra_env:
+ assert all(type(value) == str for value in extra_env.values())
+ logging.debug('extra env: %s', extra_env)
+ env.update(extra_env)
+ p = subprocess.Popen(command, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE, env=env,
+ cwd=working_dir, universal_newlines=True)
+ std_output = p.stdout.read()
+ err_output = p.stderr.read()
+ p.wait()
+ p.stdout.close()
+ p.stderr.close()
+ if not ignore_exit_code and p.returncode != 0:
+ logging.error('Command failed: %s\n'
+ 'stdout:\n%s\n'
+ 'stderr:\n%s\n', ' '.join(command), std_output, err_output)
+ sys.exit(p.returncode)
+ return std_output, err_output
+
+
+def _GetBranches():
+ """Returns a tuple of active,branches.
+
+ The 'active' is the name of the currently active branch and 'branches' is a
+ list of all branches.
+ """
+ lines = _RunCommand(['git', 'branch'])[0].split('\n')
+ branches = []
+ active = ''
+ for line in lines:
+ if '*' in line:
+ # The assumption is that the first char will always be the '*'.
+ active = line[1:].strip()
+ branches.append(active)
+ else:
+ branch = line.strip()
+ if branch:
+ branches.append(branch)
+ return active, branches
+
+
+def _ReadGitilesContent(url):
+ # Download and decode BASE64 content until
+ # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed.
+ base64_content = ReadUrlContent(url + '?format=TEXT')
+ return base64.b64decode(base64_content[0])
+
+
+def ReadRemoteCrFile(path_below_src, revision):
+ """Reads a remote Chromium file of a specific revision. Returns a string."""
+ return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision,
+ path_below_src))
+
+
+def ReadRemoteCrCommit(revision):
+ """Reads a remote Chromium commit message. Returns a string."""
+ return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision)
+
+
+def ReadUrlContent(url):
+ """Connect to a remote host and read the contents. Returns a list of lines."""
+ conn = urllib.urlopen(url)
+ try:
+ return conn.readlines()
+ except IOError as e:
+ logging.exception('Error connecting to %s. Error: %s', url, e)
+ raise
+ finally:
+ conn.close()
+
+
+def GetMatchingDepsEntries(depsentry_dict, dir_path):
+ """Gets all deps entries matching the provided path.
+
+ This list may contain more than one DepsEntry object.
+ Example: dir_path='src/testing' would give results containing both
+ 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS.
+ Example 2: dir_path='src/build' should return 'src/build' but not
+ 'src/buildtools'.
+
+ Returns:
+ A list of DepsEntry objects.
+ """
+ result = []
+ for path, depsentry in depsentry_dict.iteritems():
+ if path == dir_path:
+ result.append(depsentry)
+ else:
+ parts = path.split('/')
+ if all(part == parts[i]
+ for i, part in enumerate(dir_path.split('/'))):
+ result.append(depsentry)
+ return result
+
+
+def BuildDepsentryDict(deps_dict):
+ """Builds a dict of paths to DepsEntry objects from a raw parsed deps dict."""
+ result = {}
+ def AddDepsEntries(deps_subdict):
+ for path, deps_url in deps_subdict.iteritems():
+ if not result.has_key(path):
+ url, revision = deps_url.split('@') if deps_url else (None, None)
+ result[path] = DepsEntry(path, url, revision)
+
+ AddDepsEntries(deps_dict['deps'])
+ for deps_os in ['win', 'mac', 'unix', 'android', 'ios', 'unix']:
+ AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {}))
+ return result
+
+
+def CalculateChangedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate changed deps entries based on entries defined in the libyuv DEPS
+ file:
+ - If a shared dependency with the Chromium DEPS file: roll it to the same
+ revision as Chromium (i.e. entry in the new_cr_deps dict)
+ - If it's a Chromium sub-directory, roll it to the HEAD revision (notice
+ this means it may be ahead of the chromium_revision, but generally these
+ should be close).
+ - If it's another DEPS entry (not shared with Chromium), roll it to HEAD
+ unless it's configured to be skipped.
+
+ Returns:
+ A list of ChangedDep objects representing the changed deps.
+ """
+ result = []
+ libyuv_entries = BuildDepsentryDict(libyuv_deps)
+ new_cr_entries = BuildDepsentryDict(new_cr_deps)
+ for path, libyuv_deps_entry in libyuv_entries.iteritems():
+ if path in DONT_AUTOROLL_THESE:
+ continue
+ cr_deps_entry = new_cr_entries.get(path)
+ if cr_deps_entry:
+ # Use the revision from Chromium's DEPS file.
+ new_rev = cr_deps_entry.revision
+ assert libyuv_deps_entry.url == cr_deps_entry.url, (
+ 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' %
+ (path, libyuv_deps_entry.url, cr_deps_entry.url))
+ else:
+ # Use the HEAD of the deps repo.
+ stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url,
+ 'HEAD'])
+ new_rev = stdout.strip().split('\t')[0]
+
+ # Check if an update is necessary.
+ if libyuv_deps_entry.revision != new_rev:
+ logging.debug('Roll dependency %s to %s', path, new_rev)
+ result.append(ChangedDep(path, libyuv_deps_entry.url,
+ libyuv_deps_entry.revision, new_rev))
+ return sorted(result)
+
+
+def CalculateChangedClang(new_cr_rev):
+ def GetClangRev(lines):
+ for line in lines:
+ match = CLANG_REVISION_RE.match(line)
+ if match:
+ return match.group(1)
+ raise RollError('Could not parse Clang revision!')
+
+ with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f:
+ current_lines = f.readlines()
+ current_rev = GetClangRev(current_lines)
+
+ new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH,
+ new_cr_rev).splitlines()
+ new_rev = GetClangRev(new_clang_update_py)
+ return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev)
+
+
+def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
+ new_commit_pos, changed_deps_list, clang_change):
+ current_cr_rev = current_cr_rev[0:10]
+ new_cr_rev = new_cr_rev[0:10]
+ rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev)
+ git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos)
+
+ commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval,
+ git_number_interval)]
+ commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
+ commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
+ rev_interval))
+ # TBR field will be empty unless in some custom cases, where some engineers
+ # are added.
+ tbr_authors = ''
+ if changed_deps_list:
+ commit_msg.append('Changed dependencies:')
+
+ for c in changed_deps_list:
+ commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url,
+ c.current_rev[0:10],
+ c.new_rev[0:10]))
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS')
+ commit_msg.append('DEPS diff: %s\n' % change_url)
+ else:
+ commit_msg.append('No dependencies changed.')
+
+ if clang_change.current_rev != clang_change.new_rev:
+ commit_msg.append('Clang version changed %s:%s' %
+ (clang_change.current_rev, clang_change.new_rev))
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval,
+ CLANG_UPDATE_SCRIPT_URL_PATH)
+ commit_msg.append('Details: %s\n' % change_url)
+ else:
+ commit_msg.append('No update to Clang.\n')
+
+ commit_msg.append('TBR=%s' % tbr_authors)
+ commit_msg.append('BUG=None')
+ return '\n'.join(commit_msg)
+
+
+def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision,
+ changed_deps):
+ """Update the DEPS file with the new revision."""
+
+ # Update the chromium_revision variable.
+ with open(deps_filename, 'rb') as deps_file:
+ deps_content = deps_file.read()
+ deps_content = deps_content.replace(old_cr_revision, new_cr_revision)
+ with open(deps_filename, 'wb') as deps_file:
+ deps_file.write(deps_content)
+
+ # Update each individual DEPS entry.
+ for dep in changed_deps:
+ local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
+ if not os.path.isdir(local_dep_dir):
+ raise RollError(
+ 'Cannot find local directory %s. Either run\n'
+ 'gclient sync --deps=all\n'
+ 'or make sure the .gclient file for your solution contains all '
+ 'platforms in the target_os list, i.e.\n'
+ 'target_os = ["android", "unix", "mac", "ios", "win"];\n'
+ 'Then run "gclient sync" again.' % local_dep_dir)
+ _, stderr = _RunCommand(
+ ['roll-dep-svn', '--no-verify-revision', dep.path, dep.new_rev],
+ working_dir=CHECKOUT_SRC_DIR, ignore_exit_code=True)
+ if stderr:
+ logging.warning('roll-dep-svn: %s', stderr)
+
+
+def _IsTreeClean():
+ stdout, _ = _RunCommand(['git', 'status', '--porcelain'])
+ if len(stdout) == 0:
+ return True
+
+ logging.error('Dirty/unversioned files:\n%s', stdout)
+ return False
+
+
+def _EnsureUpdatedMasterBranch(dry_run):
+ current_branch = _RunCommand(
+ ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0]
+ if current_branch != 'master':
+ logging.error('Please checkout the master branch and re-run this script.')
+ if not dry_run:
+ sys.exit(-1)
+
+ logging.info('Updating master branch...')
+ _RunCommand(['git', 'pull'])
+
+
+def _CreateRollBranch(dry_run):
+ logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME])
+
+
+def _RemovePreviousRollBranch(dry_run):
+ active_branch, branches = _GetBranches()
+ if active_branch == ROLL_BRANCH_NAME:
+ active_branch = 'master'
+ if ROLL_BRANCH_NAME in branches:
+ logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', active_branch])
+ _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME])
+
+
+def _LocalCommit(commit_msg, dry_run):
+ logging.info('Committing changes locally.')
+ if not dry_run:
+ _RunCommand(['git', 'add', '--update', '.'])
+ _RunCommand(['git', 'commit', '-m', commit_msg])
+
+
+def _UploadCL(dry_run, rietveld_email=None):
+ logging.info('Uploading CL...')
+ if not dry_run:
+ cmd = ['git', 'cl', 'upload', '-f']
+ if rietveld_email:
+ cmd.append('--email=%s' % rietveld_email)
+ _RunCommand(cmd, extra_env={'EDITOR': 'true'})
+
+
+def _SendToCQ(dry_run, skip_cq):
+ logging.info('Sending the CL to the CQ...')
+ if not dry_run and not skip_cq:
+ _RunCommand(['git', 'cl', 'set_commit'])
+ logging.info('Sent the CL to the CQ.')
+
+
+def main():
+ p = argparse.ArgumentParser()
+ p.add_argument('--clean', action='store_true', default=False,
+ help='Removes any previous local roll branch.')
+ p.add_argument('-r', '--revision',
+ help=('Chromium Git revision to roll to. Defaults to the '
+ 'Chromium HEAD revision if omitted.'))
+ p.add_argument('-u', '--rietveld-email',
+ help=('E-mail address to use for creating the CL at Rietveld'
+ 'If omitted a previously cached one will be used or an '
+ 'error will be thrown during upload.'))
+ p.add_argument('--dry-run', action='store_true', default=False,
+ help=('Calculate changes and modify DEPS, but don\'t create '
+ 'any local branch, commit, upload CL or send any '
+ 'tryjobs.'))
+ p.add_argument('-i', '--ignore-unclean-workdir', action='store_true',
+ default=False,
+ help=('Ignore if the current branch is not master or if there '
+ 'are uncommitted changes (default: %(default)s).'))
+ p.add_argument('--skip-cq', action='store_true', default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ p.add_argument('-v', '--verbose', action='store_true', default=False,
+ help='Be extra verbose in printing of log messages.')
+ opts = p.parse_args()
+
+ if opts.verbose:
+ logging.basicConfig(level=logging.DEBUG)
+ else:
+ logging.basicConfig(level=logging.INFO)
+
+ if not opts.ignore_unclean_workdir and not _IsTreeClean():
+ logging.error('Please clean your local checkout first.')
+ return 1
+
+ if opts.clean:
+ _RemovePreviousRollBranch(opts.dry_run)
+
+ if not opts.ignore_unclean_workdir:
+ _EnsureUpdatedMasterBranch(opts.dry_run)
+
+ new_cr_rev = opts.revision
+ if not new_cr_rev:
+ stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD'])
+ head_rev = stdout.strip().split('\t')[0]
+ logging.info('No revision specified. Using HEAD: %s', head_rev)
+ new_cr_rev = head_rev
+
+ deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS')
+ libyuv_deps = ParseLocalDepsFile(deps_filename)
+ current_cr_rev = libyuv_deps['vars']['chromium_revision']
+
+ current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev))
+ new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev))
+
+ new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev)
+ changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
+ clang_change = CalculateChangedClang(new_cr_rev)
+ commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev,
+ current_commit_pos, new_commit_pos,
+ changed_deps, clang_change)
+ logging.debug('Commit message:\n%s', commit_msg)
+
+ _CreateRollBranch(opts.dry_run)
+ UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
+ _LocalCommit(commit_msg, opts.dry_run)
+ _UploadCL(opts.dry_run, opts.rietveld_email)
+ _SendToCQ(opts.dry_run, opts.skip_cq)
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/roll_deps_test.py b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/roll_deps_test.py
new file mode 100644
index 0000000..025e46e
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/roll_deps_test.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import glob
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir)
+sys.path.append(PARENT_DIR)
+import roll_deps
+from roll_deps import CalculateChangedDeps, GetMatchingDepsEntries, \
+ ParseDepsDict, ParseLocalDepsFile, UpdateDepsFile
+
+
+TEST_DATA_VARS = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d',
+}
+
+DEPS_ENTRIES = {
+ 'src/build': 'https://build.com',
+ 'src/buildtools': 'https://buildtools.com',
+ 'src/testing/gtest': 'https://gtest.com',
+ 'src/testing/gmock': 'https://gmock.com',
+}
+
+BUILD_OLD_REV = '52f7afeca991d96d68cf0507e20dbdd5b845691f'
+BUILD_NEW_REV = 'HEAD'
+BUILDTOOLS_OLD_REV = '64e38f0cebdde27aa0cfb405f330063582f9ac76'
+BUILDTOOLS_NEW_REV = '55ad626b08ef971fd82a62b7abb325359542952b'
+
+
+class TestError(Exception):
+ pass
+
+
+class FakeCmd(object):
+ def __init__(self):
+ self.expectations = []
+
+ def add_expectation(self, *args, **kwargs):
+ returns = kwargs.pop('_returns', None)
+ self.expectations.append((args, kwargs, returns))
+
+ def __call__(self, *args, **kwargs):
+ if not self.expectations:
+ raise TestError('Got unexpected\n%s\n%s' % (args, kwargs))
+ exp_args, exp_kwargs, exp_returns = self.expectations.pop(0)
+ if args != exp_args or kwargs != exp_kwargs:
+ message = 'Expected:\n args: %s\n kwargs: %s\n' % (exp_args, exp_kwargs)
+ message += 'Got:\n args: %s\n kwargs: %s\n' % (args, kwargs)
+ raise TestError(message)
+ return exp_returns
+
+
+class TestRollChromiumRevision(unittest.TestCase):
+ def setUp(self):
+ self._output_dir = tempfile.mkdtemp()
+ for test_file in glob.glob(os.path.join(SCRIPT_DIR, 'testdata', '*')):
+ shutil.copy(test_file, self._output_dir)
+ self._libyuv_depsfile = os.path.join(self._output_dir, 'DEPS')
+ self._old_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.old')
+ self._new_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.new')
+
+ self.fake = FakeCmd()
+ self.old_RunCommand = getattr(roll_deps, '_RunCommand')
+ setattr(roll_deps, '_RunCommand', self.fake)
+
+ def tearDown(self):
+ shutil.rmtree(self._output_dir, ignore_errors=True)
+ self.assertEqual(self.fake.expectations, [])
+ setattr(roll_deps, '_RunCommand', self.old_RunCommand)
+
+ def testUpdateDepsFile(self):
+ new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111'
+
+ current_rev = TEST_DATA_VARS['chromium_revision']
+ UpdateDepsFile(self._libyuv_depsfile, current_rev, new_rev, [])
+ with open(self._libyuv_depsfile) as deps_file:
+ deps_contents = deps_file.read()
+ self.assertTrue(new_rev in deps_contents,
+ 'Failed to find %s in\n%s' % (new_rev, deps_contents))
+
+ def testParseDepsDict(self):
+ with open(self._libyuv_depsfile) as deps_file:
+ deps_contents = deps_file.read()
+ local_scope = ParseDepsDict(deps_contents)
+ vars_dict = local_scope['vars']
+
+ def assertVar(variable_name):
+ self.assertEquals(vars_dict[variable_name], TEST_DATA_VARS[variable_name])
+ assertVar('chromium_git')
+ assertVar('chromium_revision')
+ self.assertEquals(len(local_scope['deps']), 3)
+
+ def testGetMatchingDepsEntriesReturnsPathInSimpleCase(self):
+ entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing/gtest')
+ self.assertEquals(len(entries), 1)
+ self.assertEquals(entries[0], DEPS_ENTRIES['src/testing/gtest'])
+
+ def testGetMatchingDepsEntriesHandlesSimilarStartingPaths(self):
+ entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing')
+ self.assertEquals(len(entries), 2)
+
+ def testGetMatchingDepsEntriesHandlesTwoPathsWithIdenticalFirstParts(self):
+ entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/build')
+ self.assertEquals(len(entries), 1)
+ self.assertEquals(entries[0], DEPS_ENTRIES['src/build'])
+
+ def testCalculateChangedDeps(self):
+ _SetupGitLsRemoteCall(self.fake,
+ 'https://chromium.googlesource.com/chromium/src/build', BUILD_NEW_REV)
+ libyuv_deps = ParseLocalDepsFile(self._libyuv_depsfile)
+ new_cr_deps = ParseLocalDepsFile(self._new_cr_depsfile)
+ changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
+ self.assertEquals(len(changed_deps), 2)
+ self.assertEquals(changed_deps[0].path, 'src/build')
+ self.assertEquals(changed_deps[0].current_rev, BUILD_OLD_REV)
+ self.assertEquals(changed_deps[0].new_rev, BUILD_NEW_REV)
+
+ self.assertEquals(changed_deps[1].path, 'src/buildtools')
+ self.assertEquals(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV)
+ self.assertEquals(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV)
+
+
+def _SetupGitLsRemoteCall(cmd_fake, url, revision):
+ cmd = ['git', 'ls-remote', url, revision]
+ cmd_fake.add_expectation(cmd, _returns=(revision, None))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS
new file mode 100644
index 0000000..9fbb48a
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS
@@ -0,0 +1,20 @@
+# DEPS file for unit tests.
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d',
+}
+
+deps = {
+ # Entry that is a directory in Chromium, so we're using a Git subtree mirror for it.
+ 'src/build':
+ Var('chromium_git') + '/chromium/src/build' + '@' + '52f7afeca991d96d68cf0507e20dbdd5b845691f',
+
+ # Entry that's also a DEPS entry in the Chromium DEPS file.
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + '64e38f0cebdde27aa0cfb405f330063582f9ac76',
+
+ # Entry only present in libyuv, not Chromium.
+ 'src/third_party/gflags/src':
+ Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
+}
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
new file mode 100644
index 0000000..d53083c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
@@ -0,0 +1,13 @@
+# DEPS file for unit tests.
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+
+ # This is updated compared to the DEPS.chromium.old file.
+ 'buildtools_revision': '55ad626b08ef971fd82a62b7abb325359542952b',
+}
+
+deps = {
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'),
+}
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
new file mode 100644
index 0000000..dd6ddae
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
@@ -0,0 +1,13 @@
+# DEPS file for unit tests.
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+
+ # This is and older revision than DEPS.chromium.new file.
+ 'buildtools_revision': '64e38f0cebdde27aa0cfb405f330063582f9ac76',
+}
+
+deps = {
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'),
+}
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/get_landmines.py b/libyuv/src/main/cpp/libyuv/tools_libyuv/get_landmines.py
new file mode 100644
index 0000000..3dc78bb
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/get_landmines.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+This file emits the list of reasons why a particular build needs to be clobbered
+(or a list of 'landmines').
+"""
+
+import os
+import sys
+
+script_dir = os.path.dirname(os.path.realpath(__file__))
+checkout_root = os.path.abspath(os.path.join(script_dir, os.pardir))
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+import landmine_utils
+
+
+distributor = landmine_utils.distributor
+gyp_defines = landmine_utils.gyp_defines
+gyp_msvs_version = landmine_utils.gyp_msvs_version
+platform = landmine_utils.platform
+
+
+def print_landmines():
+ """
+ ALL LANDMINES ARE EMITTED FROM HERE.
+ """
+ # DO NOT add landmines as part of a regular CL. Landmines are a last-effort
+ # bandaid fix if a CL that got landed has a build dependency bug and all bots
+ # need to be cleaned up. If you're writing a new CL that causes build
+ # dependency problems, fix the dependency problems instead of adding a
+ # landmine.
+ # See the Chromium version in src/build/get_landmines.py for usage examples.
+ print 'Clobber to remove GYP artifacts after switching bots to GN.'
+ print 'Another try to remove GYP artifacts after switching bots to GN.'
+
+
+def main():
+ print_landmines()
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/msan/OWNERS b/libyuv/src/main/cpp/libyuv/tools_libyuv/msan/OWNERS
new file mode 100644
index 0000000..60351e7
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/msan/OWNERS
@@ -0,0 +1,3 @@
+pbos@chromium.org
+kjellander@chromium.org
+
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/msan/blacklist.txt b/libyuv/src/main/cpp/libyuv/tools_libyuv/msan/blacklist.txt
new file mode 100644
index 0000000..8b5e42a
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/msan/blacklist.txt
@@ -0,0 +1,9 @@
+# The rules in this file are only applied at compile time.
+# Because the Chrome buildsystem does not automatically touch the files
+# mentioned here, changing this file requires clobbering all MSan bots.
+#
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/OWNERS b/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/OWNERS
new file mode 100644
index 0000000..b608519
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/OWNERS
@@ -0,0 +1,4 @@
+pbos@webrtc.org
+kjellander@webrtc.org
+fbarchard@chromium.org
+
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/blacklist.txt b/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/blacklist.txt
new file mode 100644
index 0000000..8bcb290
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/blacklist.txt
@@ -0,0 +1,15 @@
+#############################################################################
+# UBSan blacklist.
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding WebRTC-specific blacklist entries.
+
+#############################################################################
+# YASM does some funny things that UBsan doesn't like.
+# https://crbug.com/489901
+src:*/third_party/yasm/*
+
+#############################################################################
+# Ignore system libraries.
+src:*/usr/*
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/vptr_blacklist.txt b/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/vptr_blacklist.txt
new file mode 100644
index 0000000..8ed070c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/ubsan/vptr_blacklist.txt
@@ -0,0 +1,21 @@
+#############################################################################
+# UBSan vptr blacklist.
+# Function and type based blacklisting use a mangled name, and it is especially
+# tricky to represent C++ types. For now, any possible changes by name manglings
+# are simply represented as wildcard expressions of regexp, and thus it might be
+# over-blacklisted.
+#
+# Please think twice before you add or remove these rules.
+#
+# This is a stripped down copy of Chromium's vptr_blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
+#############################################################################
+# Using raw pointer values.
+#
+# A raw pointer value (16) is used to infer the field offset by
+# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET.
+
+# Example:
+# src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
+
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.bat b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.bat
new file mode 100644
index 0000000..5fceca6
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.bat
@@ -0,0 +1,79 @@
+@echo off
+:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+::
+:: Use of this source code is governed by a BSD-style license
+:: that can be found in the LICENSE file in the root of the source
+:: tree. An additional intellectual property rights grant can be found
+:: in the file PATENTS. All contributing project authors may
+:: be found in the AUTHORS file in the root of the source tree.
+
+:: This script is a copy of chrome_tests.bat with the following changes:
+:: - Invokes libyuv_tests.py instead of chrome_tests.py
+:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
+:: it possible to execute the Python scripts properly.
+
+:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+ if %1 == () GOTO:TOOLNAME_NOT_FOUND
+ if %1 == --tool GOTO:TOOLNAME_FOUND
+ SHIFT
+ goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (tsan or drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
+echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
+exit /B 1
+
+:SETUP_DRMEMORY
+if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:SETUP_TSAN
+:: Set up PIN_COMMAND to invoke TSan {{{1
+set TSAN_PATH=%THISDIR%..\..\third_party\tsan
+set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
+if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
+echo "Can't find ThreadSanitizer executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:TSAN_BINARY_OK
+%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
+set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
+set RUNNING_ON_VALGRIND=yes
+python %THISDIR%libyuv_tests.py %*
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.py b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.py
new file mode 100644
index 0000000..e780bd9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Runs various libyuv tests through valgrind_test.py.
+
+This script inherits the chrome_tests.py in Chrome, but allows running any test
+instead of only the hard-coded ones. It uses the -t cmdline flag to do this, and
+only supports specifying a single test for each run.
+
+Suppression files:
+The Chrome valgrind directory we use as a DEPS dependency contains the following
+suppression files:
+ valgrind/memcheck/suppressions.txt
+ valgrind/memcheck/suppressions_mac.txt
+ valgrind/tsan/suppressions.txt
+ valgrind/tsan/suppressions_mac.txt
+ valgrind/tsan/suppressions_win32.txt
+Since they're referenced from the chrome_tests.py script, we have similar files
+below the directory of this script. When executing, this script will setup both
+Chrome's suppression files and our own, so we can easily maintain libyuv
+specific suppressions in our own files.
+"""
+
+import logging
+import optparse
+import os
+import sys
+
+import logging_utils
+import path_utils
+
+import chrome_tests
+
+
+class LibyuvTest(chrome_tests.ChromeTests):
+ """Class that handles setup of suppressions for libyuv.
+
+ Everything else is inherited from chrome_tests.ChromeTests.
+ """
+
+ def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
+ """Override command-building method so we can add more suppressions."""
+ cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe,
+ valgrind_test_args)
+ # When ChromeTests._DefaultCommand has executed, it has setup suppression
+ # files based on what's found in the memcheck/ or tsan/ subdirectories of
+ # this script's location. If Mac or Windows is executing, additional
+ # platform specific files have also been added.
+ # Since only the ones located below this directory is added, we must also
+ # add the ones maintained by Chrome, located in ../../tools/valgrind.
+
+ # The idea is to look for --suppression arguments in the cmd list and add a
+ # modified copy of each suppression file, for the corresponding file in
+ # ../../tools/valgrind.
+ script_dir = path_utils.ScriptDir()
+ old_base, _ = os.path.split(script_dir)
+
+ checkout_src = os.path.abspath(os.path.join(script_dir, os.pardir,
+ os.pardir))
+ new_dir = os.path.join(checkout_src, 'tools', 'valgrind')
+ add_suppressions = []
+ for token in cmd:
+ if '--suppressions' in token:
+ add_suppressions.append(token.replace(script_dir, new_dir))
+ return add_suppressions + cmd
+
+
+def main(_):
+ parser = optparse.OptionParser('usage: %prog -b -t ')
+ parser.disable_interspersed_args()
+ parser.add_option('-b', '--build-dir',
+ help=('Location of the compiler output. Can only be used '
+ 'when the test argument does not contain this path.'))
+ parser.add_option("--target", help="Debug or Release")
+ parser.add_option('-t', '--test', help='Test to run.')
+ parser.add_option('', '--baseline', action='store_true', default=False,
+ help='Generate baseline data instead of validating')
+ parser.add_option('', '--gtest_filter',
+ help='Additional arguments to --gtest_filter')
+ parser.add_option('', '--gtest_repeat',
+ help='Argument for --gtest_repeat')
+ parser.add_option("--gtest_shuffle", action="store_true", default=False,
+ help="Randomize tests' orders on every iteration.")
+ parser.add_option("--gtest_break_on_failure", action="store_true",
+ default=False,
+ help="Drop in to debugger on assertion failure. Also "
+ "useful for forcing tests to exit with a stack dump "
+ "on the first assertion failure when running with "
+ "--gtest_repeat=-1")
+ parser.add_option('-v', '--verbose', action='store_true', default=False,
+ help='Verbose output - enable debug log messages')
+ parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
+ help='Specify a valgrind tool to run the tests under')
+ parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='',
+ help='Specify custom flags for the selected valgrind tool')
+ parser.add_option('', '--keep_logs', action='store_true', default=False,
+ help=('Store memory tool logs in the .logs directory '
+ 'instead of /tmp.\nThis can be useful for tool '
+ 'developers/maintainers.\nPlease note that the '
+ '.logs directory will be clobbered on tool startup.'))
+ parser.add_option("--test-launcher-bot-mode", action="store_true",
+ help="run the tests with --test-launcher-bot-mode")
+ parser.add_option("--test-launcher-total-shards", type=int,
+ help="run the tests with --test-launcher-total-shards")
+ parser.add_option("--test-launcher-shard-index", type=int,
+ help="run the tests with --test-launcher-shard-index")
+ options, args = parser.parse_args()
+
+ if options.verbose:
+ logging_utils.config_root(logging.DEBUG)
+ else:
+ logging_utils.config_root()
+
+ if not options.test:
+ parser.error('--test not specified')
+
+ # Support build dir both with and without the target.
+ if (options.target and options.build_dir and
+ not options.build_dir.endswith(options.target)):
+ options.build_dir = os.path.join(options.build_dir, options.target)
+
+ # If --build_dir is provided, prepend it to the test executable if needed.
+ test_executable = options.test
+ if options.build_dir and not test_executable.startswith(options.build_dir):
+ test_executable = os.path.join(options.build_dir, test_executable)
+ args = [test_executable] + args
+
+ test = LibyuvTest(options, args, 'cmdline')
+ return test.Run()
+
+if __name__ == '__main__':
+ return_code = main(sys.argv)
+ sys.exit(return_code)
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.sh b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.sh
new file mode 100644
index 0000000..975b5e3
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/libyuv_tests.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Set up some paths and re-direct the arguments to libyuv_tests.py
+
+# This script is a copy of the chrome_tests.sh wrapper script with the following
+# changes:
+# - The locate_valgrind.sh of Chromium's Valgrind scripts dir is used to locate
+# the Valgrind framework install. If it fails a fallback path is used instead
+# (../../chromium/src/third_party/valgrind/linux_x64) and a warning message
+# is showed by |show_locate_valgrind_failed_warning|.
+# - libyuv_tests.py is invoked instead of chrome_tests.py.
+# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it
+# possible to execute the Python scripts properly.
+
+export THISDIR=`dirname $0`
+ARGV_COPY="$@"
+
+# We need to set CHROME_VALGRIND iff using Memcheck:
+# tools_libyuv/valgrind/libyuv_tests.sh --tool memcheck
+# or
+# tools_libyuv/valgrind/libyuv_tests.sh --tool=memcheck
+tool="memcheck" # Default to memcheck.
+while (( "$#" ))
+do
+ if [[ "$1" == "--tool" ]]
+ then
+ tool="$2"
+ shift
+ elif [[ "$1" =~ --tool=(.*) ]]
+ then
+ tool="${BASH_REMATCH[1]}"
+ fi
+ shift
+done
+
+NEEDS_VALGRIND=0
+
+case "$tool" in
+ "memcheck")
+ NEEDS_VALGRIND=1
+ ;;
+esac
+
+# For libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind
+# scripts dir to locate the Valgrind framework install
+CHROME_VALGRIND_SCRIPTS=$THISDIR/../../tools/valgrind
+
+if [ "$NEEDS_VALGRIND" == "1" ]
+then
+ CHROME_VALGRIND=`sh $CHROME_VALGRIND_SCRIPTS/locate_valgrind.sh`
+ if [ "$CHROME_VALGRIND" = "" ]
+ then
+ CHROME_VALGRIND=../../src/third_party/valgrind/linux_x64
+ echo
+ echo "-------------------- WARNING ------------------------"
+ echo "locate_valgrind.sh failed."
+ echo "Using $CHROME_VALGRIND as a fallback location."
+ echo "This might be because:"
+ echo "1) This is a swarming bot"
+ echo "2) You haven't set up the valgrind binaries correctly."
+ echo "In this case, please make sure you have followed the instructions at"
+ echo "http://www.chromium.org/developers/how-tos/using-valgrind/get-valgrind"
+ echo "Notice: In the .gclient file, you need to add this for the 'libyuv'"
+ echo "solution since our directory structure is different from Chromium's:"
+ echo "\"custom_deps\": {"
+ echo " \"libyuv/third_party/valgrind\":"
+ echo " \"https://chromium.googlesource.com/chromium/deps/valgrind/binaries\","
+ echo "},"
+ echo "-----------------------------------------------------"
+ echo
+ fi
+ echo "Using valgrind binaries from ${CHROME_VALGRIND}"
+
+ PATH="${CHROME_VALGRIND}/bin:$PATH"
+ # We need to set these variables to override default lib paths hard-coded into
+ # Valgrind binary.
+ export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
+ export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
+
+ # Clean up some /tmp directories that might be stale due to interrupted
+ # chrome_tests.py execution.
+ # FYI:
+ # -mtime +1 <- only print files modified more than 24h ago,
+ # -print0/-0 are needed to handle possible newlines in the filenames.
+ echo "Cleanup /tmp from Valgrind stuff"
+ find /tmp -maxdepth 1 \(\
+ -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
+ \) -mtime +1 -print0 | xargs -0 rm -rf
+fi
+
+# Add Chrome's Valgrind scripts dir to the PYTHON_PATH since it contains
+# the scripts that are needed for this script to run
+PYTHONPATH=$THISDIR/../../tools/python/google:$CHROME_VALGRIND_SCRIPTS python \
+ "$THISDIR/libyuv_tests.py" $ARGV_COPY
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/OWNERS b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/OWNERS
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/OWNERS
@@ -0,0 +1 @@
+*
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/PRESUBMIT.py b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
new file mode 100644
index 0000000..0332921
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py
+
+See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
+for more details on the presubmit API built into gcl.
+"""
+
+import os
+import re
+import sys
+
+def CheckChange(input_api, output_api):
+ """Checks the memcheck suppressions files for bad data."""
+
+ # Add the path to the Chrome valgrind dir to the import path:
+ tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..', '..',
+ 'tools', 'valgrind')
+ sys.path.append(tools_vg_path)
+ import suppressions
+
+ sup_regex = re.compile('suppressions.*\.txt$')
+ suppressions = {}
+ errors = []
+ check_for_memcheck = False
+ # skip_next_line has 3 possible values:
+ # - False: don't skip the next line.
+ # - 'skip_suppression_name': the next line is a suppression name, skip.
+ # - 'skip_param': the next line is a system call parameter error, skip.
+ skip_next_line = False
+ for f in filter(lambda x: sup_regex.search(x.LocalPath()),
+ input_api.AffectedFiles()):
+ for line, line_num in zip(f.NewContents(),
+ xrange(1, len(f.NewContents()) + 1)):
+ line = line.lstrip()
+ if line.startswith('#') or not line:
+ continue
+
+ if skip_next_line:
+ if skip_next_line == 'skip_suppression_name':
+ if 'insert_a_suppression_name_here' in line:
+ errors.append('"insert_a_suppression_name_here" is not a valid '
+ 'suppression name')
+ if suppressions.has_key(line):
+ if f.LocalPath() == suppressions[line][1]:
+ errors.append('suppression with name "%s" at %s line %s '
+ 'has already been defined at line %s' %
+ (line, f.LocalPath(), line_num,
+ suppressions[line][1]))
+ else:
+ errors.append('suppression with name "%s" at %s line %s '
+ 'has already been defined at %s line %s' %
+ (line, f.LocalPath(), line_num,
+ suppressions[line][0], suppressions[line][1]))
+ else:
+ suppressions[line] = (f, line_num)
+ check_for_memcheck = True;
+ skip_next_line = False
+ continue
+ if check_for_memcheck:
+ if not line.startswith('Memcheck:'):
+ errors.append('"%s" should be "Memcheck:..." in %s line %s' %
+ (line, f.LocalPath(), line_num))
+ check_for_memcheck = False;
+ if line == '{':
+ skip_next_line = 'skip_suppression_name'
+ continue
+ if line == "Memcheck:Param":
+ skip_next_line = 'skip_param'
+ continue
+
+ if (line.startswith('fun:') or line.startswith('obj:') or
+ line.startswith('Memcheck:') or line == '}' or
+ line == '...'):
+ continue
+ errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(),
+ line_num))
+ if errors:
+ return [output_api.PresubmitError('\n'.join(errors))]
+ return []
+
+def CheckChangeOnUpload(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def CheckChangeOnCommit(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def GetPreferredTrySlaves():
+ # We don't have any memcheck slaves yet, so there's no use for this method.
+ # When we have, the slave name(s) should be put into this list.
+ return []
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions.txt b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions.txt
new file mode 100644
index 0000000..3ad0c8c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions_mac.txt b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
new file mode 100644
index 0000000..3ad0c8c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions_win32.txt b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
new file mode 100644
index 0000000..3ad0c8c
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/basictypes_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/basictypes_test.cc
new file mode 100644
index 0000000..89f7644
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/basictypes_test.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, Endian) {
+ uint16 v16 = 0x1234u;
+ uint8 first_byte = *reinterpret_cast(&v16);
+#if defined(LIBYUV_LITTLE_ENDIAN)
+ EXPECT_EQ(0x34u, first_byte);
+#else
+ EXPECT_EQ(0x12u, first_byte);
+#endif
+}
+
+TEST_F(LibYUVBaseTest, SizeOfTypes) {
+ int8 i8 = -1;
+ uint8 u8 = 1u;
+ int16 i16 = -1;
+ uint16 u16 = 1u;
+ int32 i32 = -1;
+ uint32 u32 = 1u;
+ int64 i64 = -1;
+ uint64 u64 = 1u;
+ EXPECT_EQ(1u, sizeof(i8));
+ EXPECT_EQ(1u, sizeof(u8));
+ EXPECT_EQ(2u, sizeof(i16));
+ EXPECT_EQ(2u, sizeof(u16));
+ EXPECT_EQ(4u, sizeof(i32));
+ EXPECT_EQ(4u, sizeof(u32));
+ EXPECT_EQ(8u, sizeof(i64));
+ EXPECT_EQ(8u, sizeof(u64));
+ EXPECT_GT(0, i8);
+ EXPECT_LT(0u, u8);
+ EXPECT_GT(0, i16);
+ EXPECT_LT(0u, u16);
+ EXPECT_GT(0, i32);
+ EXPECT_LT(0u, u32);
+ EXPECT_GT(0, i64);
+ EXPECT_LT(0u, u64);
+}
+
+TEST_F(LibYUVBaseTest, SizeOfConstants) {
+ EXPECT_EQ(8u, sizeof(INT64_C(0)));
+ EXPECT_EQ(8u, sizeof(UINT64_C(0)));
+ EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));
+ EXPECT_EQ(8u, sizeof(UINT64_C(0x8765432112345678)));
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/color_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/color_test.cc
new file mode 100644
index 0000000..0aa7a54
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/color_test.cc
@@ -0,0 +1,585 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+
+namespace libyuv {
+
+// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define ERROR_R 1
+#define ERROR_G 1
+#define ERROR_B 3
+#define ERROR_FULL 6
+#define ERROR_J420 5
+#else
+#define ERROR_R 1
+#define ERROR_G 1
+#define ERROR_B 3
+#define ERROR_FULL 5
+#define ERROR_J420 3
+#endif
+
+#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
+ TEST_F(LibYUVColorTest, TESTNAME) { \
+ const int kPixels = benchmark_width_ * benchmark_height_; \
+ const int kHalfPixels = \
+ ((benchmark_width_ + 1) / 2) * ((benchmark_height_ + HS1) / HS); \
+ align_buffer_page_end(orig_y, kPixels); \
+ align_buffer_page_end(orig_u, kHalfPixels); \
+ align_buffer_page_end(orig_v, kHalfPixels); \
+ align_buffer_page_end(orig_pixels, kPixels * 4); \
+ align_buffer_page_end(temp_y, kPixels); \
+ align_buffer_page_end(temp_u, kHalfPixels); \
+ align_buffer_page_end(temp_v, kHalfPixels); \
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4); \
+ align_buffer_page_end(dst_pixels_c, kPixels * 4); \
+ \
+ MemRandomize(orig_pixels, kPixels * 4); \
+ MemRandomize(orig_y, kPixels); \
+ MemRandomize(orig_u, kHalfPixels); \
+ MemRandomize(orig_v, kHalfPixels); \
+ MemRandomize(temp_y, kPixels); \
+ MemRandomize(temp_u, kHalfPixels); \
+ MemRandomize(temp_v, kHalfPixels); \
+ MemRandomize(dst_pixels_opt, kPixels * 4); \
+ MemRandomize(dst_pixels_c, kPixels * 4); \
+ \
+ /* The test is overall for color conversion matrix being reversible, so */ \
+ /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \
+ uint8* p = orig_y; \
+ for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \
+ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
+ uint8 r = static_cast(fastrand()); \
+ p[0] = r; \
+ p[1] = r; \
+ p[HN] = r; \
+ p[HN + 1] = r; \
+ p += 2; \
+ } \
+ if (benchmark_width_ & 1) { \
+ uint8 r = static_cast(fastrand()); \
+ p[0] = r; \
+ p[HN] = r; \
+ p += 1; \
+ } \
+ p += HN; \
+ } \
+ if ((benchmark_height_ & 1) && HS == 2) { \
+ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
+ uint8 r = static_cast(fastrand()); \
+ p[0] = r; \
+ p[1] = r; \
+ p += 2; \
+ } \
+ if (benchmark_width_ & 1) { \
+ uint8 r = static_cast(fastrand()); \
+ p[0] = r; \
+ p += 1; \
+ } \
+ } \
+ /* Start with YUV converted to ARGB. */ \
+ YUVTOARGB(orig_y, benchmark_width_, orig_u, (benchmark_width_ + 1) / 2, \
+ orig_v, (benchmark_width_ + 1) / 2, orig_pixels, \
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
+ \
+ ARGBTOYUV(orig_pixels, benchmark_width_ * 4, temp_y, benchmark_width_, \
+ temp_u, (benchmark_width_ + 1) / 2, temp_v, \
+ (benchmark_width_ + 1) / 2, benchmark_width_, \
+ benchmark_height_); \
+ \
+ MaskCpuFlags(disable_cpu_flags_); \
+ YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \
+ temp_v, (benchmark_width_ + 1) / 2, dst_pixels_c, \
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \
+ temp_v, (benchmark_width_ + 1) / 2, dst_pixels_opt, \
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
+ } \
+ /* Test C and SIMD match. */ \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ /* Test SIMD is close to original. */ \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_NEAR(static_cast(orig_pixels[i]), \
+ static_cast(dst_pixels_opt[i]), DIFF); \
+ } \
+ \
+ free_aligned_buffer_page_end(orig_pixels); \
+ free_aligned_buffer_page_end(orig_y); \
+ free_aligned_buffer_page_end(orig_u); \
+ free_aligned_buffer_page_end(orig_v); \
+ free_aligned_buffer_page_end(temp_y); \
+ free_aligned_buffer_page_end(temp_u); \
+ free_aligned_buffer_page_end(temp_v); \
+ free_aligned_buffer_page_end(dst_pixels_opt); \
+ free_aligned_buffer_page_end(dst_pixels_c); \
+ }
+
+TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL)
+TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL)
+TESTCS(TestJ420, J420ToARGB, ARGBToJ420, 1, 2, benchmark_width_, ERROR_J420)
+TESTCS(TestJ422, J422ToARGB, ARGBToJ422, 0, 1, 0, ERROR_J420)
+
+static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8 orig_y[16]);
+ SIMD_ALIGNED(uint8 orig_u[8]);
+ SIMD_ALIGNED(uint8 orig_v[8]);
+ SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ I422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8 orig_y[16]);
+ SIMD_ALIGNED(uint8 orig_u[8]);
+ SIMD_ALIGNED(uint8 orig_v[8]);
+ SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ J422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YToRGB(int y, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+
+ SIMD_ALIGNED(uint8 orig_y[16]);
+ SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+
+ /* YUV converted to ARGB. */
+ I400ToARGB(orig_y, kWidth, orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YJToRGB(int y, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+
+ SIMD_ALIGNED(uint8 orig_y[16]);
+ SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+
+ /* YUV converted to ARGB. */
+ J400ToARGB(orig_y, kWidth, orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+// Pick a method for clamping.
+// #define CLAMPMETHOD_IF 1
+// #define CLAMPMETHOD_TABLE 1
+#define CLAMPMETHOD_TERNARY 1
+// #define CLAMPMETHOD_MASK 1
+
+// Pick a method for rounding.
+#define ROUND(f) static_cast(f + 0.5f)
+// #define ROUND(f) lrintf(f)
+// #define ROUND(f) static_cast(round(f))
+// #define ROUND(f) _mm_cvt_ss2si(_mm_load_ss(&f))
+
+#if defined(CLAMPMETHOD_IF)
+static int RoundToByte(float f) {
+ int i = ROUND(f);
+ if (i < 0) {
+ i = 0;
+ }
+ if (i > 255) {
+ i = 255;
+ }
+ return i;
+}
+#elif defined(CLAMPMETHOD_TABLE)
+static const unsigned char clamptable[811] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+ 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
+ 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
+ 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+ 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
+ 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
+ 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
+ 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
+ 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255};
+
+static int RoundToByte(float f) {
+ return clamptable[ROUND(f) + 276];
+}
+#elif defined(CLAMPMETHOD_TERNARY)
+static int RoundToByte(float f) {
+ int i = ROUND(f);
+ return (i < 0) ? 0 : ((i > 255) ? 255 : i);
+}
+#elif defined(CLAMPMETHOD_MASK)
+static int RoundToByte(float f) {
+ int i = ROUND(f);
+ i = ((-(i) >> 31) & (i)); // clamp to 0.
+ return (((255 - (i)) >> 31) | (i)) & 255; // clamp to 255.
+}
+#endif
+
+#define RANDOM256(s) ((s & 1) ? ((s >> 1) ^ 0xb8) : (s >> 1))
+
+TEST_F(LibYUVColorTest, TestRoundToByte) {
+ int allb = 0;
+ int count = benchmark_width_ * benchmark_height_;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ float f = (fastrand() & 255) * 3.14f - 260.f;
+ for (int j = 0; j < count; ++j) {
+ int b = RoundToByte(f);
+ f += 0.91f;
+ allb |= b;
+ }
+ }
+ EXPECT_GE(allb, 0);
+ EXPECT_LE(allb, 255);
+}
+
+static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
+ *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
+ *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
+}
+
+static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y - (v - 128) * -1.40200);
+ *g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
+ *b = RoundToByte(y - (u - 128) * -1.77200);
+}
+
+TEST_F(LibYUVColorTest, TestYUV) {
+ int r0, g0, b0, r1, g1, b1;
+
+ // cyan (less red)
+ YUVToRGBReference(240, 255, 0, &r0, &g0, &b0);
+ EXPECT_EQ(56, r0);
+ EXPECT_EQ(255, g0);
+ EXPECT_EQ(255, b0);
+
+ YUVToRGB(240, 255, 0, &r1, &g1, &b1);
+ EXPECT_EQ(57, r1);
+ EXPECT_EQ(255, g1);
+ EXPECT_EQ(255, b1);
+
+ // green (less red and blue)
+ YUVToRGBReference(240, 0, 0, &r0, &g0, &b0);
+ EXPECT_EQ(56, r0);
+ EXPECT_EQ(255, g0);
+ EXPECT_EQ(2, b0);
+
+ YUVToRGB(240, 0, 0, &r1, &g1, &b1);
+ EXPECT_EQ(57, r1);
+ EXPECT_EQ(255, g1);
+ EXPECT_EQ(5, b1);
+
+ for (int i = 0; i < 256; ++i) {
+ YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
+ YUVToRGB(i, 128, 128, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+
+ YUVToRGBReference(i, 0, 0, &r0, &g0, &b0);
+ YUVToRGB(i, 0, 0, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+
+ YUVToRGBReference(i, 0, 255, &r0, &g0, &b0);
+ YUVToRGB(i, 0, 255, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ }
+}
+
+TEST_F(LibYUVColorTest, TestGreyYUV) {
+ int r0, g0, b0, r1, g1, b1, r2, g2, b2;
+
+ // black
+ YUVToRGBReference(16, 128, 128, &r0, &g0, &b0);
+ EXPECT_EQ(0, r0);
+ EXPECT_EQ(0, g0);
+ EXPECT_EQ(0, b0);
+
+ YUVToRGB(16, 128, 128, &r1, &g1, &b1);
+ EXPECT_EQ(0, r1);
+ EXPECT_EQ(0, g1);
+ EXPECT_EQ(0, b1);
+
+ // white
+ YUVToRGBReference(240, 128, 128, &r0, &g0, &b0);
+ EXPECT_EQ(255, r0);
+ EXPECT_EQ(255, g0);
+ EXPECT_EQ(255, b0);
+
+ YUVToRGB(240, 128, 128, &r1, &g1, &b1);
+ EXPECT_EQ(255, r1);
+ EXPECT_EQ(255, g1);
+ EXPECT_EQ(255, b1);
+
+ // grey
+ YUVToRGBReference(128, 128, 128, &r0, &g0, &b0);
+ EXPECT_EQ(130, r0);
+ EXPECT_EQ(130, g0);
+ EXPECT_EQ(130, b0);
+
+ YUVToRGB(128, 128, 128, &r1, &g1, &b1);
+ EXPECT_EQ(130, r1);
+ EXPECT_EQ(130, g1);
+ EXPECT_EQ(130, b1);
+
+ for (int y = 0; y < 256; ++y) {
+ YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
+ YUVToRGB(y, 128, 128, &r1, &g1, &b1);
+ YToRGB(y, &r2, &g2, &b2);
+ EXPECT_EQ(r0, r1);
+ EXPECT_EQ(g0, g1);
+ EXPECT_EQ(b0, b1);
+ EXPECT_EQ(r0, r2);
+ EXPECT_EQ(g0, g2);
+ EXPECT_EQ(b0, b2);
+ }
+}
+
+static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
+ int i;
+ printf("hist");
+ for (i = 0; i < 256; ++i) {
+ if (rh[i] || gh[i] || bh[i]) {
+ printf("\t%8d", i - 128);
+ }
+ }
+ printf("\nred");
+ for (i = 0; i < 256; ++i) {
+ if (rh[i] || gh[i] || bh[i]) {
+ printf("\t%8d", rh[i]);
+ }
+ }
+ printf("\ngreen");
+ for (i = 0; i < 256; ++i) {
+ if (rh[i] || gh[i] || bh[i]) {
+ printf("\t%8d", gh[i]);
+ }
+ }
+ printf("\nblue");
+ for (i = 0; i < 256; ++i) {
+ if (rh[i] || gh[i] || bh[i]) {
+ printf("\t%8d", bh[i]);
+ }
+ }
+ printf("\n");
+}
+
+TEST_F(LibYUVColorTest, TestFullYUV) {
+ int rh[256] =
+ {
+ 0,
+ },
+ gh[256] =
+ {
+ 0,
+ },
+ bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; ++y2) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+TEST_F(LibYUVColorTest, TestFullYUVJ) {
+ int rh[256] =
+ {
+ 0,
+ },
+ gh[256] =
+ {
+ 0,
+ },
+ bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; ++y2) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVJToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, 1);
+ EXPECT_NEAR(g0, g1, 1);
+ EXPECT_NEAR(b0, b1, 1);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+TEST_F(LibYUVColorTest, TestGreyYUVJ) {
+ int r0, g0, b0, r1, g1, b1, r2, g2, b2;
+
+ // black
+ YUVJToRGBReference(0, 128, 128, &r0, &g0, &b0);
+ EXPECT_EQ(0, r0);
+ EXPECT_EQ(0, g0);
+ EXPECT_EQ(0, b0);
+
+ YUVJToRGB(0, 128, 128, &r1, &g1, &b1);
+ EXPECT_EQ(0, r1);
+ EXPECT_EQ(0, g1);
+ EXPECT_EQ(0, b1);
+
+ // white
+ YUVJToRGBReference(255, 128, 128, &r0, &g0, &b0);
+ EXPECT_EQ(255, r0);
+ EXPECT_EQ(255, g0);
+ EXPECT_EQ(255, b0);
+
+ YUVJToRGB(255, 128, 128, &r1, &g1, &b1);
+ EXPECT_EQ(255, r1);
+ EXPECT_EQ(255, g1);
+ EXPECT_EQ(255, b1);
+
+ // grey
+ YUVJToRGBReference(128, 128, 128, &r0, &g0, &b0);
+ EXPECT_EQ(128, r0);
+ EXPECT_EQ(128, g0);
+ EXPECT_EQ(128, b0);
+
+ YUVJToRGB(128, 128, 128, &r1, &g1, &b1);
+ EXPECT_EQ(128, r1);
+ EXPECT_EQ(128, g1);
+ EXPECT_EQ(128, b1);
+
+ for (int y = 0; y < 256; ++y) {
+ YUVJToRGBReference(y, 128, 128, &r0, &g0, &b0);
+ YUVJToRGB(y, 128, 128, &r1, &g1, &b1);
+ YJToRGB(y, &r2, &g2, &b2);
+ EXPECT_EQ(r0, r1);
+ EXPECT_EQ(g0, g1);
+ EXPECT_EQ(b0, b1);
+ EXPECT_EQ(r0, r2);
+ EXPECT_EQ(g0, g2);
+ EXPECT_EQ(b0, b2);
+ }
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/compare_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/compare_test.cc
new file mode 100644
index 0000000..149a9a1
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/compare_test.cc
@@ -0,0 +1,615 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
+#include "libyuv/cpu_id.h"
+#include "libyuv/video_common.h"
+
+namespace libyuv {
+
+// hash seed of 5381 recommended.
+static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 hash = seed;
+ if (count > 0) {
+ do {
+ hash = hash * 33 + *src++;
+ } while (--count);
+ }
+ return hash;
+}
+
+TEST_F(LibYUVCompareTest, Djb2_Test) {
+ const int kMaxTest = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_a, kMaxTest);
+ align_buffer_page_end(src_b, kMaxTest);
+
+ const char* fox =
+ "The quick brown fox jumps over the lazy dog"
+ " and feels as if he were in the seventh heaven of typography"
+ " together with Hermann Zapf";
+ uint32 foxhash = HashDjb2(reinterpret_cast(fox), 131, 5381);
+ const uint32 kExpectedFoxHash = 2611006483u;
+ EXPECT_EQ(kExpectedFoxHash, foxhash);
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = (fastrand() & 0xff);
+ src_b[i] = (fastrand() & 0xff);
+ }
+ // Compare different buffers. Expect hash is different.
+ uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
+ uint32 h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make last half same. Expect hash is different.
+ memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make first half same. Expect hash is different.
+ memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2);
+ memcpy(src_b + kMaxTest / 2, src_b, kMaxTest / 2);
+ memcpy(src_a, src_b, kMaxTest / 2);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make same. Expect hash is same.
+ memcpy(src_a, src_b, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_EQ(h1, h2);
+
+ // Mask seed different. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 1234);
+ EXPECT_NE(h1, h2);
+
+ // Make one byte different in middle. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ ++src_b[kMaxTest / 2];
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make first byte different. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ ++src_b[0];
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make last byte different. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ ++src_b[kMaxTest - 1];
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make a zeros. Test different lengths. Expect hash is different.
+ memset(src_a, 0, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_a, kMaxTest / 2, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make a zeros and seed of zero. Test different lengths. Expect hash is same.
+ memset(src_a, 0, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 0);
+ h2 = HashDjb2(src_a, kMaxTest / 2, 0);
+ EXPECT_EQ(h1, h2);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkDjb2_Opt) {
+ const int kMaxTest = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_a, kMaxTest);
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_page_end(src_a);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkDjb2_Unaligned) {
+ const int kMaxTest = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_a, kMaxTest + 1);
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i + 1] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_page_end(src_a);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Opt) {
+ uint32 fourcc;
+ const int kMaxTest = benchmark_width_ * benchmark_height_ * 4;
+ align_buffer_page_end(src_a, kMaxTest);
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = 255;
+ }
+
+ src_a[0] = 0;
+ fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast(libyuv::FOURCC_BGRA), fourcc);
+ src_a[0] = 255;
+ src_a[3] = 0;
+ fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast(libyuv::FOURCC_ARGB), fourcc);
+ src_a[3] = 255;
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ }
+ EXPECT_EQ(0u, fourcc);
+
+ free_aligned_buffer_page_end(src_a);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
+ uint32 fourcc;
+ const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1;
+ align_buffer_page_end(src_a, kMaxTest);
+ for (int i = 1; i < kMaxTest; ++i) {
+ src_a[i] = 255;
+ }
+
+ src_a[0 + 1] = 0;
+ fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast(libyuv::FOURCC_BGRA), fourcc);
+ src_a[0 + 1] = 255;
+ src_a[3 + 1] = 0;
+ fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast(libyuv::FOURCC_ARGB), fourcc);
+ src_a[3 + 1] = 255;
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ }
+ EXPECT_EQ(0u, fourcc);
+
+ free_aligned_buffer_page_end(src_a);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ // Test known value
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint32 h1 = HammingDistance_C(src_a, src_b, 16);
+ EXPECT_EQ(16u, h1);
+
+ // Test C vs OPT on random buffer
+ MemRandomize(src_a, kMaxWidth);
+ MemRandomize(src_b, kMaxWidth);
+
+ uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+ h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
+#elif defined(HAS_HAMMINGDISTANCE_X86)
+ h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
+#else
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+#endif
+ }
+
+ EXPECT_EQ(h0, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ // Test known value
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint32 h1 = HammingDistance_C(src_a, src_b, 16);
+ EXPECT_EQ(16u, h1);
+
+ // Test C vs OPT on random buffer
+ MemRandomize(src_a, kMaxWidth);
+ MemRandomize(src_b, kMaxWidth);
+
+ uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(h0, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint64 h1 = ComputeHammingDistance(src_a, src_b, 16);
+ EXPECT_EQ(16u, h1);
+
+ // Test C vs OPT on random buffer
+ MemRandomize(src_a, kMaxWidth);
+ MemRandomize(src_b, kMaxWidth);
+
+ uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+ h1 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(h0, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+ EXPECT_EQ(790u, h1);
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+ h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(0u, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, SumSquareError) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ uint64 err;
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(0u, err);
+
+ memset(src_a, 1, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(static_cast(err), kMaxWidth);
+
+ memset(src_a, 190, kMaxWidth);
+ memset(src_b, 193, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(static_cast(err), kMaxWidth * 3 * 3);
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = (fastrand() & 0xff);
+ src_b[i] = (fastrand() & 0xff);
+ }
+
+ MaskCpuFlags(disable_cpu_flags_);
+ uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(c_err, opt_err);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkPsnr_Opt) {
+ align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a, benchmark_width_, src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkPsnr_Unaligned) {
+ align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_ + 1);
+ align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i + 1] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a + 1, benchmark_width_, src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, Psnr) {
+ const int kSrcWidth = benchmark_width_;
+ const int kSrcHeight = benchmark_height_;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_page_end(src_a, kSrcPlaneSize);
+ align_buffer_page_end(src_b, kSrcPlaneSize);
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ double err;
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ EXPECT_EQ(err, kMaxPsnr);
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ EXPECT_EQ(err, 0.0);
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ EXPECT_GT(err, 48.0);
+ EXPECT_LT(err, 49.0);
+
+ for (int i = 0; i < kSrcPlaneSize; ++i) {
+ src_a[i] = i;
+ }
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ EXPECT_GT(err, 2.0);
+ if (kSrcWidth * kSrcHeight >= 256) {
+ EXPECT_LT(err, 6.0);
+ }
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
+ src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(disable_cpu_flags_);
+ double c_err, opt_err;
+
+ c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ EXPECT_EQ(opt_err, c_err);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, DISABLED_BenchmarkSsim_Opt) {
+ align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFrameSsim(src_a, benchmark_width_, src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0); // Pass if we get this far.
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, Ssim) {
+ const int kSrcWidth = benchmark_width_;
+ const int kSrcHeight = benchmark_height_;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_page_end(src_a, kSrcPlaneSize);
+ align_buffer_page_end(src_b, kSrcPlaneSize);
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ if (kSrcWidth <= 8 || kSrcHeight <= 8) {
+ printf("warning - Ssim size too small. Testing function executes.\n");
+ }
+
+ double err;
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_EQ(err, 1.0);
+ }
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_LT(err, 0.0001);
+ }
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_GT(err, 0.0001);
+ EXPECT_LT(err, 0.9);
+ }
+
+ for (int i = 0; i < kSrcPlaneSize; ++i) {
+ src_a[i] = i;
+ }
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_GT(err, 0.0);
+ EXPECT_LT(err, 0.01);
+ }
+
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
+ src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(disable_cpu_flags_);
+ double c_err, opt_err;
+
+ c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_EQ(opt_err, c_err);
+ }
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/convert_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/convert_test.cc
new file mode 100644
index 0000000..deeb30e
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/convert_test.cc
@@ -0,0 +1,1924 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "../unit_test/unit_test.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \
+ static_cast(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_EQ(0, max_diff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
+TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
+TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
+
+// Test Android 420 to I420
+#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ uint8* src_u = src_uv + OFF_U; \
+ uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
+ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
+ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \
+ static_cast(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_EQ(0, max_diff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \
+ _Any, +, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
+ _Unaligned, +, 1, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
+ -, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V)
+
+TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
+ dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
+ dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \
+ static_cast(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast( \
+ dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
+
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
+TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
+ DOY) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
+ kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ if (DOY) { \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \
+ static_cast(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast( \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
+ 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
+
+TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
+TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
+
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
+ kWidth * BPP_C, kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb32_c[i]) - \
+ static_cast(dst_argb32_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, DIFF, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
+ BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
+
+#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb_c[i + OFF]) - \
+ static_cast(dst_argb_opt[i + OFF])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, DIFF) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
+
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = kWidth * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeight); \
+ memset(dst_argb_opt, 101, kStrideB* kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_opt, kWidth * BPP_B, kWidth, \
+ NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
+ memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
+ memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
+ FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ kHeight); \
+ FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * 4; ++j) { \
+ int abs_diff = \
+ abs(static_cast(dst_argb32_c[i * kWidth * 4 + j]) - \
+ static_cast(dst_argb32_opt[i * kWidth * 4 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+
+#ifdef DO_THREE_PLANES
+// Do 3 allocations for yuv. conventional but slower.
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \
+ kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
+ kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_NEAR(static_cast(dst_y_c[i * kWidth + j]), \
+ static_cast(dst_y_opt[i * kWidth + j]), DIFF); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_NEAR(static_cast(dst_u_c[i * kStrideUV + j]), \
+ static_cast(dst_u_opt[i * kStrideUV + j]), DIFF); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_NEAR(static_cast(dst_v_c[i * kStrideUV + j]), \
+ static_cast(dst_v_opt[i * kStrideUV + j]), DIFF); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+#else
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_NEAR(static_cast(dst_y_c[i * kWidth + j]), \
+ static_cast(dst_y_opt[i * kWidth + j]), DIFF); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_NEAR(static_cast(dst_uv_c[i * kStrideUV + j]), \
+ static_cast(dst_uv_opt[i * kStrideUV + j]), DIFF); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+#endif
+
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ DIFF) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
+#if defined(__arm__) || defined(__aarch64__)
+// arm version subsamples by summing 4 pixels then multiplying by matrix with
+// 4x smaller coefficients which are rounded to nearest integer.
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 4)
+#else
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 0)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
+// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
+
+#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \
+ static_cast(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ int abs_diff = \
+ abs(static_cast(dst_uv_c[i * kStrideUV * 2 + j]) - \
+ static_cast(dst_uv_opt[i * kStrideUV * 2 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb_c[i]) - \
+ static_cast(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
+ STRIDE_B, HEIGHT_B, DIFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \
+ kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
+ kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb_c[i]) - \
+ static_cast(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
+
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF)
+
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
+ NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb_c[i]) - \
+ static_cast(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
+ STRIDE_B, HEIGHT_B, DIFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
+ kWidth, kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
+ NULL, kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb_c[i]) - \
+ static_cast(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
+
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF)
+
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+
+#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideA* kHeightA); \
+ memset(dst_argb_opt, 101, kStrideA* kHeightA); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \
+ NEG kHeight); \
+ } \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \
+ NEG kHeight); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \
+ 0) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \
+ +, 1) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0)
+
+TESTSYM(ARGBToARGB, 4, 4, 1)
+TESTSYM(ARGBToBGRA, 4, 4, 1)
+TESTSYM(ARGBToABGR, 4, 4, 1)
+TESTSYM(BGRAToARGB, 4, 4, 1)
+TESTSYM(ABGRToARGB, 4, 4, 1)
+
+TEST_F(LibYUVConvertTest, Test565) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
+
+#ifdef HAVE_JPEG
+TEST_F(LibYUVConvertTest, ValidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // Test special value that matches marker start.
+ memset(orig_pixels, 0xff, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // EOI, SOI. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ const int kMultiple = 10;
+ const int kBufSize = kImageSize * kMultiple + kOff;
+ align_buffer_page_end(orig_pixels, kBufSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kBufSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
+
+ // EOI, SOI. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, InvalidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+
+ // NULL pointer. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(NULL, kSize));
+
+ // Negative size. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
+
+ // Too large size. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // SOI but no EOI. Expect fail.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+ }
+
+ // EOI but no SOI. Expect fail.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 0;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, FuzzJpeg) {
+ // SOI but no EOI. Expect fail.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ const int kSize = fastrand() % 5000 + 2;
+ align_buffer_page_end(orig_pixels, kSize);
+ MemRandomize(orig_pixels, kSize);
+
+ // Add SOI so frame will be scanned.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - 1] = 0xff;
+ ValidateJpeg(orig_pixels, kSize); // Failure normally expected.
+ free_aligned_buffer_page_end(orig_pixels);
+ }
+}
+
+TEST_F(LibYUVConvertTest, MJPGToI420) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
+ SUBSAMPLE(benchmark_height_, 2));
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
+ SUBSAMPLE(benchmark_height_, 2));
+
+ // EOI, SOI to make MJPG appear valid.
+ memset(orig_pixels, 0, kSize);
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ int ret =
+ MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt,
+ SUBSAMPLE(benchmark_width_, 2), dst_v_opt,
+ SUBSAMPLE(benchmark_width_, 2), benchmark_width_,
+ benchmark_height_, benchmark_width_, benchmark_height_);
+ // Expect failure because image is not really valid.
+ EXPECT_EQ(1, ret);
+ }
+
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_opt);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, MJPGToARGB) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
+
+ // EOI, SOI to make MJPG appear valid.
+ memset(orig_pixels, 0, kSize);
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ int ret = MJPGToARGB(orig_pixels, kSize, dst_argb_opt, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_);
+ // Expect failure because image is not really valid.
+ EXPECT_EQ(1, ret);
+ }
+
+ free_aligned_buffer_page_end(dst_argb_opt);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+#endif // HAVE_JPEG
+
+TEST_F(LibYUVConvertTest, NV12Crop) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y =
+ ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size =
+ kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
+ align_buffer_page_end(src_y, sample_size);
+ uint8* src_uv = src_y + kWidth * kHeight;
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) {
+ src_uv[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_y_2, 1, kDestWidth * kDestHeight);
+ memset(dst_u_2, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v_2, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12);
+
+ NV12ToI420(src_y + crop_y * kWidth, kWidth,
+ src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y,
+ kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight);
+
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+ dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+ dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(dst_y_2);
+ free_aligned_buffer_page_end(dst_u_2);
+ free_aligned_buffer_page_end(dst_v_2);
+ free_aligned_buffer_page_end(src_y);
+}
+
+TEST_F(LibYUVConvertTest, TestYToARGB) {
+ uint8 y[32];
+ uint8 expectedg[32];
+ for (int i = 0; i < 32; ++i) {
+ y[i] = i * 5 + 17;
+ expectedg[i] = static_cast((y[i] - 16) * 1.164f + 0.5f);
+ }
+ uint8 argb[32 * 4];
+ YToARGB(y, 0, argb, 0, 32, 1);
+
+ for (int i = 0; i < 32; ++i) {
+ printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
+ argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
+ }
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
+ }
+}
+
+static const uint8 kNoDither4x4[16] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+TEST_F(LibYUVConvertTest, TestNoDither) {
+ align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_rgb565dither,
+ benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
+ benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
+ }
+
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_rgb565);
+ free_aligned_buffer_page_end(dst_rgb565dither);
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+TEST_F(LibYUVConvertTest, TestDither) {
+ align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_rgb565dither,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_argbdither,
+ benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
+ benchmark_height_);
+ RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
+ }
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_rgb565);
+ free_aligned_buffer_page_end(dst_rgb565dither);
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(dst_argbdither);
+}
+
+#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B##Dither( \
+ src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
+ dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
+ kWidth * BPP_C, kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ int abs_diff = abs(static_cast(dst_argb32_c[i]) - \
+ static_cast(dst_argb32_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, DIFF, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \
+ BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
+ BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+
+#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
+ TEST_F(LibYUVConvertTest, NAME) { \
+ const int kWidth = benchmark_width_; \
+ const int kHeight = benchmark_height_; \
+ \
+ align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ align_buffer_page_end(orig_y, kWidth* kHeight); \
+ align_buffer_page_end(orig_u, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ align_buffer_page_end(orig_v, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ \
+ /* Convert UYVY to NV12 in 2 steps for reference */ \
+ libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
+ orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ \
+ /* Convert to NV12 */ \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
+ dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ } \
+ \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(orig_y[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
+ ++i) { \
+ EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
+ } \
+ \
+ free_aligned_buffer_page_end(orig_uyvy); \
+ free_aligned_buffer_page_end(orig_y); \
+ free_aligned_buffer_page_end(orig_u); \
+ free_aligned_buffer_page_end(orig_v); \
+ free_aligned_buffer_page_end(dst_y_orig); \
+ free_aligned_buffer_page_end(dst_uv_orig); \
+ free_aligned_buffer_page_end(dst_y); \
+ free_aligned_buffer_page_end(dst_uv); \
+ }
+
+TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
+TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
+
+#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_b + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
+ } \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \
+ kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
+ kWidth, kHeight); \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+
+#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+ } \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ FMT_PLANAR##To##FMT_C( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
+ kWidth, kHeight); \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
+
+TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+
+TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
+ // 2x2 frames
+ uint32_t src[4];
+ uint32_t dst[4];
+ // some random input
+ src[0] = 0x11000000;
+ src[1] = 0x00450000;
+ src[2] = 0x00009f00;
+ src[3] = 0x000000ff;
+ // zeros on destination
+ dst[0] = 0x00000000;
+ dst[1] = 0x00000000;
+ dst[2] = 0x00000000;
+ dst[3] = 0x00000000;
+
+ int r = ConvertToARGB(reinterpret_cast(src),
+ 16, // input size
+ reinterpret_cast(dst),
+ 8, // destination stride
+ 0, // crop_x
+ 0, // crop_y
+ 2, // width
+ 2, // height
+ 2, // crop width
+ 2, // crop height
+ kRotate90, FOURCC_ARGB);
+
+ EXPECT_EQ(r, 0);
+ // 90 degrees rotation, no conversion
+ EXPECT_EQ(dst[0], src[2]);
+ EXPECT_EQ(dst[1], src[0]);
+ EXPECT_EQ(dst[2], src[3]);
+ EXPECT_EQ(dst[3], src[1]);
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/cpu_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/cpu_test.cc
new file mode 100644
index 0000000..c0b8910
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/cpu_test.cc
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/version.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestCpuHas) {
+ int cpu_flags = TestCpuFlag(-1);
+ printf("Cpu Flags %x\n", cpu_flags);
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ printf("Has ARM %x\n", has_arm);
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON %x\n", has_neon);
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ printf("Has X86 %x\n", has_x86);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ printf("Has SSE2 %x\n", has_sse2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ printf("Has SSSE3 %x\n", has_ssse3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ printf("Has SSE4.1 %x\n", has_sse41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ printf("Has SSE4.2 %x\n", has_sse42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ printf("Has AVX %x\n", has_avx);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ printf("Has AVX2 %x\n", has_avx2);
+ int has_erms = TestCpuFlag(kCpuHasERMS);
+ printf("Has ERMS %x\n", has_erms);
+ int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+ printf("Has FMA3 %x\n", has_fma3);
+ int has_avx3 = TestCpuFlag(kCpuHasAVX3);
+ printf("Has AVX3 %x\n", has_avx3);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
+ printf("Has F16C %x\n", has_f16c);
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ printf("Has MIPS %x\n", has_mips);
+ int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
+ printf("Has DSPR2 %x\n", has_dspr2);
+ int has_msa = TestCpuFlag(kCpuHasMSA);
+ printf("Has MSA %x\n", has_msa);
+}
+
+TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
+#if defined(__aarch64__)
+ printf("Arm64 build\n");
+#endif
+#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
+ printf("Neon build enabled\n");
+#endif
+#if defined(__x86_64__) || defined(_M_X64)
+ printf("x64 build\n");
+#endif
+#ifdef _MSC_VER
+ printf("_MSC_VER %d\n", _MSC_VER);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(VISUALC_HAS_AVX2))
+ printf("Has AVX2 1\n");
+#else
+ printf("Has AVX2 0\n");
+// If compiler does not support AVX2, the following function not expected:
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+TEST_F(LibYUVBaseTest, TestCpuId) {
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ if (has_x86) {
+ int cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
+ CpuId(0, 0, cpu_info);
+ cpu_info[0] = cpu_info[1]; // Reorder output
+ cpu_info[1] = cpu_info[3];
+ cpu_info[3] = 0;
+ printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast(&cpu_info[0]),
+ cpu_info[0], cpu_info[1], cpu_info[2]);
+ EXPECT_EQ(12u, strlen(reinterpret_cast(&cpu_info[0])));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
+ CpuId(1, 0, cpu_info);
+ int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
+ model);
+ }
+}
+#endif
+
+static int FileExists(const char* file_name) {
+ FILE* f = fopen(file_name, "r");
+ if (!f) {
+ return 0;
+ }
+ fclose(f);
+ return 1;
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxNeon) {
+ if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
+ EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+ }
+#if defined(__linux__) && defined(__ARM_NEON__)
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));
+#endif
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/cpu_thread_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/cpu_thread_test.cc
new file mode 100644
index 0000000..59061b9
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/cpu_thread_test.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2017 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "libyuv/cpu_id.h"
+
+#if defined(__clang__)
+#if __has_include()
+#define LIBYUV_HAVE_PTHREAD 1
+#endif
+#elif defined(__linux__)
+#define LIBYUV_HAVE_PTHREAD 1
+#endif
+
+#ifdef LIBYUV_HAVE_PTHREAD
+#include
+#endif
+
+namespace libyuv {
+
+#ifdef LIBYUV_HAVE_PTHREAD
+void* ThreadMain(void* arg) {
+ int* flags = static_cast(arg);
+
+ *flags = TestCpuFlag(kCpuHasSSSE3);
+ return nullptr;
+}
+#endif // LIBYUV_HAVE_PTHREAD
+
+// Call TestCpuFlag() from two threads. ThreadSanitizer should not report any
+// data race.
+TEST(LibYUVCpuThreadTest, TestCpuFlagMultipleThreads) {
+#ifdef LIBYUV_HAVE_PTHREAD
+ int cpu_flags1;
+ int cpu_flags2;
+ int ret;
+ pthread_t thread1;
+ pthread_t thread2;
+
+ MaskCpuFlags(0); // Reset to 0 to allow auto detect.
+ ret = pthread_create(&thread1, nullptr, ThreadMain, &cpu_flags1);
+ ASSERT_EQ(ret, 0);
+ ret = pthread_create(&thread2, nullptr, ThreadMain, &cpu_flags2);
+ ASSERT_EQ(ret, 0);
+ ret = pthread_join(thread1, nullptr);
+ EXPECT_EQ(ret, 0);
+ ret = pthread_join(thread2, nullptr);
+ EXPECT_EQ(ret, 0);
+ EXPECT_EQ(cpu_flags1, cpu_flags2);
+#else
+ printf("pthread unavailable; Test skipped.");
+#endif // LIBYUV_HAVE_PTHREAD
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/math_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/math_test.cc
new file mode 100644
index 0000000..2b4b57b
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/math_test.cc
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestFixedDiv) {
+ int num[1280];
+ int div[1280];
+ int result_opt[1280];
+ int result_c[1280];
+
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1));
+ EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
+ // TODO(fbarchard): Avoid the following that throw exceptions.
+ // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
+ // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));
+
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
+ EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
+ EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
+ EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
+ EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
+ EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
+ EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
+ EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
+ EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
+ EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
+ EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
+ EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
+ EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
+ EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+ for (int i = 1; i < 4100; ++i) {
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
+ EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
+ EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
+ EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
+ EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
+ }
+ EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+ MemRandomize(reinterpret_cast(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast(&div[0]), sizeof(div));
+ for (int j = 0; j < 1280; ++j) {
+ if (div[j] == 0) {
+ div[j] = 1280;
+ }
+ num[j] &= 0xffff; // Clamp to avoid divide overflow.
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ for (int j = 0; j < 1280; ++j) {
+ result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+ }
+ }
+ for (int j = 0; j < 1280; ++j) {
+ result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+ EXPECT_NEAR(result_c[j], result_opt[j], 1);
+ }
+}
+
+TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
+ int num[1280];
+ int div[1280];
+ int result_opt[1280];
+ int result_c[1280];
+
+ MemRandomize(reinterpret_cast(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast(&div[0]), sizeof(div));
+ for (int j = 0; j < 1280; ++j) {
+ num[j] &= 4095; // Make numerator smaller.
+ div[j] &= 4095; // Make divisor smaller.
+ if (div[j] == 0) {
+ div[j] = 1280;
+ }
+ }
+
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ if (has_x86) {
+ for (int j = 0; j < 1280; ++j) {
+ result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+ }
+ } else {
+ for (int j = 0; j < 1280; ++j) {
+ result_opt[j] = libyuv::FixedDiv_C(num[j], div[j]);
+ }
+ }
+ }
+ for (int j = 0; j < 1280; ++j) {
+ result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+ EXPECT_NEAR(result_c[j], result_opt[j], 1);
+ }
+}
+
+TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
+ int num[1280];
+ int div[1280];
+ int result_opt[1280];
+ int result_c[1280];
+
+ MemRandomize(reinterpret_cast(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast(&div[0]), sizeof(div));
+ for (int j = 0; j < 1280; ++j) {
+ num[j] &= 4095; // Make numerator smaller.
+ div[j] &= 4095; // Make divisor smaller.
+ if (div[j] <= 1) {
+ div[j] = 1280;
+ }
+ }
+
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ if (has_x86) {
+ for (int j = 0; j < 1280; ++j) {
+ result_opt[j] = libyuv::FixedDiv1(num[j], div[j]);
+ }
+ } else {
+ for (int j = 0; j < 1280; ++j) {
+ result_opt[j] = libyuv::FixedDiv1_C(num[j], div[j]);
+ }
+ }
+ }
+ for (int j = 0; j < 1280; ++j) {
+ result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]);
+ EXPECT_NEAR(result_c[j], result_opt[j], 1);
+ }
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/planar_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/planar_test.cc
new file mode 100644
index 0000000..28d557a
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/planar_test.cc
@@ -0,0 +1,2521 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVPlanarTest, TestAttenuate) {
+ const int kSize = 1280 * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(atten_pixels, kSize);
+ align_buffer_page_end(unatten_pixels, kSize);
+ align_buffer_page_end(atten2_pixels, kSize);
+
+ // Test unattenuation clamps
+ orig_pixels[0 * 4 + 0] = 200u;
+ orig_pixels[0 * 4 + 1] = 129u;
+ orig_pixels[0 * 4 + 2] = 127u;
+ orig_pixels[0 * 4 + 3] = 128u;
+ // Test unattenuation transparent and opaque are unaffected
+ orig_pixels[1 * 4 + 0] = 16u;
+ orig_pixels[1 * 4 + 1] = 64u;
+ orig_pixels[1 * 4 + 2] = 192u;
+ orig_pixels[1 * 4 + 3] = 0u;
+ orig_pixels[2 * 4 + 0] = 16u;
+ orig_pixels[2 * 4 + 1] = 64u;
+ orig_pixels[2 * 4 + 2] = 192u;
+ orig_pixels[2 * 4 + 3] = 255u;
+ orig_pixels[3 * 4 + 0] = 16u;
+ orig_pixels[3 * 4 + 1] = 64u;
+ orig_pixels[3 * 4 + 2] = 192u;
+ orig_pixels[3 * 4 + 3] = 128u;
+ ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+ EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
+ EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
+ EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
+ EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
+ EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
+ EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
+ EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
+ EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
+ EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
+ EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
+ EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
+ EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = i / 2;
+ orig_pixels[i * 4 + 2] = i / 3;
+ orig_pixels[i * 4 + 3] = i;
+ }
+ ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1);
+ ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
+ }
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
+ }
+ // Make sure transparent, 50% and opaque are fully accurate.
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 1]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 2]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 3]);
+ EXPECT_EQ(64, atten_pixels[128 * 4 + 0]);
+ EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
+ EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
+ EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
+ EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
+ EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
+ EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
+ EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
+
+ free_aligned_buffer_page_end(atten2_pixels);
+ free_aligned_buffer_page_end(unatten_pixels);
+ free_aligned_buffer_page_end(atten_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+static int TestAttenuateI(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBAttenuate(src_argb + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBAttenuate(src_argb + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
+ int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
+ int max_diff =
+ TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
+ int max_diff =
+ TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
+ int max_diff =
+ TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+static int TestUnattenuateI(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb[i + off] = (fastrand() & 0xff);
+ }
+ ARGBAttenuate(src_argb + off, kStride, src_argb + off, kStride, width,
+ height);
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBUnattenuate(src_argb + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBUnattenuate(src_argb + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
+ int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
+ int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 1);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
+ int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, -1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
+ int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
+ SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
+ SIMD_ALIGNED(int32 added_pixels[16][16][4]);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ orig_pixels[y][x][0] = 1u;
+ orig_pixels[y][x][1] = 2u;
+ orig_pixels[y][x][2] = 3u;
+ orig_pixels[y][x][3] = 255u;
+ }
+ }
+
+ ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
+ &added_pixels[0][0][0], 16 * 4, 16, 16);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+ EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+ EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+ EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+ }
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBGray) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test black
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 255u;
+ // Test white
+ orig_pixels[4][0] = 255u;
+ orig_pixels[4][1] = 255u;
+ orig_pixels[4][2] = 255u;
+ orig_pixels[4][3] = 255u;
+ // Test color
+ orig_pixels[5][0] = 16u;
+ orig_pixels[5][1] = 64u;
+ orig_pixels[5][2] = 192u;
+ orig_pixels[5][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(30u, orig_pixels[0][0]);
+ EXPECT_EQ(30u, orig_pixels[0][1]);
+ EXPECT_EQ(30u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(149u, orig_pixels[1][0]);
+ EXPECT_EQ(149u, orig_pixels[1][1]);
+ EXPECT_EQ(149u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(76u, orig_pixels[2][0]);
+ EXPECT_EQ(76u, orig_pixels[2][1]);
+ EXPECT_EQ(76u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(0u, orig_pixels[3][0]);
+ EXPECT_EQ(0u, orig_pixels[3][1]);
+ EXPECT_EQ(0u, orig_pixels[3][2]);
+ EXPECT_EQ(255u, orig_pixels[3][3]);
+ EXPECT_EQ(255u, orig_pixels[4][0]);
+ EXPECT_EQ(255u, orig_pixels[4][1]);
+ EXPECT_EQ(255u, orig_pixels[4][2]);
+ EXPECT_EQ(255u, orig_pixels[4][3]);
+ EXPECT_EQ(96u, orig_pixels[5][0]);
+ EXPECT_EQ(96u, orig_pixels[5][1]);
+ EXPECT_EQ(96u, orig_pixels[5][2]);
+ EXPECT_EQ(224u, orig_pixels[5][3]);
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 gray_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test black
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 255u;
+ // Test white
+ orig_pixels[4][0] = 255u;
+ orig_pixels[4][1] = 255u;
+ orig_pixels[4][2] = 255u;
+ orig_pixels[4][3] = 255u;
+ // Test color
+ orig_pixels[5][0] = 16u;
+ orig_pixels[5][1] = 64u;
+ orig_pixels[5][2] = 192u;
+ orig_pixels[5][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
+ EXPECT_EQ(30u, gray_pixels[0][0]);
+ EXPECT_EQ(30u, gray_pixels[0][1]);
+ EXPECT_EQ(30u, gray_pixels[0][2]);
+ EXPECT_EQ(128u, gray_pixels[0][3]);
+ EXPECT_EQ(149u, gray_pixels[1][0]);
+ EXPECT_EQ(149u, gray_pixels[1][1]);
+ EXPECT_EQ(149u, gray_pixels[1][2]);
+ EXPECT_EQ(0u, gray_pixels[1][3]);
+ EXPECT_EQ(76u, gray_pixels[2][0]);
+ EXPECT_EQ(76u, gray_pixels[2][1]);
+ EXPECT_EQ(76u, gray_pixels[2][2]);
+ EXPECT_EQ(255u, gray_pixels[2][3]);
+ EXPECT_EQ(0u, gray_pixels[3][0]);
+ EXPECT_EQ(0u, gray_pixels[3][1]);
+ EXPECT_EQ(0u, gray_pixels[3][2]);
+ EXPECT_EQ(255u, gray_pixels[3][3]);
+ EXPECT_EQ(255u, gray_pixels[4][0]);
+ EXPECT_EQ(255u, gray_pixels[4][1]);
+ EXPECT_EQ(255u, gray_pixels[4][2]);
+ EXPECT_EQ(255u, gray_pixels[4][3]);
+ EXPECT_EQ(96u, gray_pixels[5][0]);
+ EXPECT_EQ(96u, gray_pixels[5][1]);
+ EXPECT_EQ(96u, gray_pixels[5][2]);
+ EXPECT_EQ(224u, gray_pixels[5][3]);
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBSepia) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test black
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 255u;
+ // Test white
+ orig_pixels[4][0] = 255u;
+ orig_pixels[4][1] = 255u;
+ orig_pixels[4][2] = 255u;
+ orig_pixels[4][3] = 255u;
+ // Test color
+ orig_pixels[5][0] = 16u;
+ orig_pixels[5][1] = 64u;
+ orig_pixels[5][2] = 192u;
+ orig_pixels[5][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(0u, orig_pixels[3][0]);
+ EXPECT_EQ(0u, orig_pixels[3][1]);
+ EXPECT_EQ(0u, orig_pixels[3][2]);
+ EXPECT_EQ(255u, orig_pixels[3][3]);
+ EXPECT_EQ(239u, orig_pixels[4][0]);
+ EXPECT_EQ(255u, orig_pixels[4][1]);
+ EXPECT_EQ(255u, orig_pixels[4][2]);
+ EXPECT_EQ(255u, orig_pixels[4][3]);
+ EXPECT_EQ(88u, orig_pixels[5][0]);
+ EXPECT_EQ(114u, orig_pixels[5][1]);
+ EXPECT_EQ(127u, orig_pixels[5][2]);
+ EXPECT_EQ(224u, orig_pixels[5][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+
+ // Matrix for Sepia.
+ SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+ 17 / 2, 68 / 2, 35 / 2, 0, 22 / 2, 88 / 2, 45 / 2, 0,
+ 24 / 2, 98 / 2, 50 / 2, 0, 0, 0, 0, 64, // Copy alpha.
+ };
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kRGBToSepia[0], 16, 1);
+ EXPECT_EQ(31u, dst_pixels_opt[0][0]);
+ EXPECT_EQ(43u, dst_pixels_opt[0][1]);
+ EXPECT_EQ(47u, dst_pixels_opt[0][2]);
+ EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+ EXPECT_EQ(135u, dst_pixels_opt[1][0]);
+ EXPECT_EQ(175u, dst_pixels_opt[1][1]);
+ EXPECT_EQ(195u, dst_pixels_opt[1][2]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+ EXPECT_EQ(67u, dst_pixels_opt[2][0]);
+ EXPECT_EQ(87u, dst_pixels_opt[2][1]);
+ EXPECT_EQ(99u, dst_pixels_opt[2][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+ EXPECT_EQ(87u, dst_pixels_opt[3][0]);
+ EXPECT_EQ(112u, dst_pixels_opt[3][1]);
+ EXPECT_EQ(127u, dst_pixels_opt[3][2]);
+ EXPECT_EQ(224u, dst_pixels_opt[3][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+ &kRGBToSepia[0], 1280, 1);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kRGBToSepia[0], 1280, 1);
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+ EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+ EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+ EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+
+ // Matrix for Sepia.
+ SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+ 17, 68, 35, 0, 22, 88, 45, 0,
+ 24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes.
+ };
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1);
+ EXPECT_EQ(31u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(67u, orig_pixels[2][0]);
+ EXPECT_EQ(87u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(87u, orig_pixels[3][0]);
+ EXPECT_EQ(112u, orig_pixels[3][1]);
+ EXPECT_EQ(127u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Matrix for Sepia.
+ static const uint8 kARGBTable[256 * 4] = {
+ 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u,
+ };
+
+ orig_pixels[0][0] = 0u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 0u;
+ orig_pixels[1][0] = 1u;
+ orig_pixels[1][1] = 1u;
+ orig_pixels[1][2] = 1u;
+ orig_pixels[1][3] = 1u;
+ orig_pixels[2][0] = 2u;
+ orig_pixels[2][1] = 2u;
+ orig_pixels[2][2] = 2u;
+ orig_pixels[2][3] = 2u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 1u;
+ orig_pixels[3][2] = 2u;
+ orig_pixels[3][3] = 3u;
+ // Do 16 to test asm version.
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+ EXPECT_EQ(1u, orig_pixels[0][0]);
+ EXPECT_EQ(2u, orig_pixels[0][1]);
+ EXPECT_EQ(3u, orig_pixels[0][2]);
+ EXPECT_EQ(4u, orig_pixels[0][3]);
+ EXPECT_EQ(5u, orig_pixels[1][0]);
+ EXPECT_EQ(6u, orig_pixels[1][1]);
+ EXPECT_EQ(7u, orig_pixels[1][2]);
+ EXPECT_EQ(8u, orig_pixels[1][3]);
+ EXPECT_EQ(9u, orig_pixels[2][0]);
+ EXPECT_EQ(10u, orig_pixels[2][1]);
+ EXPECT_EQ(11u, orig_pixels[2][2]);
+ EXPECT_EQ(12u, orig_pixels[2][3]);
+ EXPECT_EQ(1u, orig_pixels[3][0]);
+ EXPECT_EQ(6u, orig_pixels[3][1]);
+ EXPECT_EQ(11u, orig_pixels[3][2]);
+ EXPECT_EQ(16u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1);
+ }
+}
+
+// Same as TestARGBColorTable except alpha does not change.
+TEST_F(LibYUVPlanarTest, TestRGBColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Matrix for Sepia.
+ static const uint8 kARGBTable[256 * 4] = {
+ 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u,
+ };
+
+ orig_pixels[0][0] = 0u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 0u;
+ orig_pixels[1][0] = 1u;
+ orig_pixels[1][1] = 1u;
+ orig_pixels[1][2] = 1u;
+ orig_pixels[1][3] = 1u;
+ orig_pixels[2][0] = 2u;
+ orig_pixels[2][1] = 2u;
+ orig_pixels[2][2] = 2u;
+ orig_pixels[2][3] = 2u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 1u;
+ orig_pixels[3][2] = 2u;
+ orig_pixels[3][3] = 3u;
+ // Do 16 to test asm version.
+ RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+ EXPECT_EQ(1u, orig_pixels[0][0]);
+ EXPECT_EQ(2u, orig_pixels[0][1]);
+ EXPECT_EQ(3u, orig_pixels[0][2]);
+ EXPECT_EQ(0u, orig_pixels[0][3]); // Alpha unchanged.
+ EXPECT_EQ(5u, orig_pixels[1][0]);
+ EXPECT_EQ(6u, orig_pixels[1][1]);
+ EXPECT_EQ(7u, orig_pixels[1][2]);
+ EXPECT_EQ(1u, orig_pixels[1][3]); // Alpha unchanged.
+ EXPECT_EQ(9u, orig_pixels[2][0]);
+ EXPECT_EQ(10u, orig_pixels[2][1]);
+ EXPECT_EQ(11u, orig_pixels[2][2]);
+ EXPECT_EQ(2u, orig_pixels[2][3]); // Alpha unchanged.
+ EXPECT_EQ(1u, orig_pixels[3][0]);
+ EXPECT_EQ(6u, orig_pixels[3][1]);
+ EXPECT_EQ(11u, orig_pixels[3][2]);
+ EXPECT_EQ(3u, orig_pixels[3][3]); // Alpha unchanged.
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0,
+ 1280, 1);
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]);
+ EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]);
+ EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]);
+ EXPECT_EQ(i & 255, orig_pixels[i][3]);
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0,
+ 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBMirror) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i / 4;
+ }
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
+ EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
+ EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
+ EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestShade) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 shade_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ orig_pixels[0][0] = 10u;
+ orig_pixels[0][1] = 20u;
+ orig_pixels[0][2] = 40u;
+ orig_pixels[0][3] = 80u;
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 0u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 255u;
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 0u;
+ orig_pixels[2][3] = 0u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 0u;
+ // Do 8 pixels to allow opt version to be used.
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff);
+ EXPECT_EQ(10u, shade_pixels[0][0]);
+ EXPECT_EQ(20u, shade_pixels[0][1]);
+ EXPECT_EQ(40u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+ EXPECT_EQ(0u, shade_pixels[1][0]);
+ EXPECT_EQ(0u, shade_pixels[1][1]);
+ EXPECT_EQ(0u, shade_pixels[1][2]);
+ EXPECT_EQ(128u, shade_pixels[1][3]);
+ EXPECT_EQ(0u, shade_pixels[2][0]);
+ EXPECT_EQ(0u, shade_pixels[2][1]);
+ EXPECT_EQ(0u, shade_pixels[2][2]);
+ EXPECT_EQ(0u, shade_pixels[2][3]);
+ EXPECT_EQ(0u, shade_pixels[3][0]);
+ EXPECT_EQ(0u, shade_pixels[3][1]);
+ EXPECT_EQ(0u, shade_pixels[3][2]);
+ EXPECT_EQ(0u, shade_pixels[3][3]);
+
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080);
+ EXPECT_EQ(5u, shade_pixels[0][0]);
+ EXPECT_EQ(10u, shade_pixels[0][1]);
+ EXPECT_EQ(20u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080);
+ EXPECT_EQ(5u, shade_pixels[0][0]);
+ EXPECT_EQ(5u, shade_pixels[0][1]);
+ EXPECT_EQ(5u, shade_pixels[0][2]);
+ EXPECT_EQ(5u, shade_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1,
+ 0x80808080);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[1280][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels[1280][4]);
+ memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+ memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
+
+ orig_pixels_0[0][0] = 16u;
+ orig_pixels_0[0][1] = 32u;
+ orig_pixels_0[0][2] = 64u;
+ orig_pixels_0[0][3] = 128u;
+ orig_pixels_0[1][0] = 0u;
+ orig_pixels_0[1][1] = 0u;
+ orig_pixels_0[1][2] = 0u;
+ orig_pixels_0[1][3] = 255u;
+ orig_pixels_0[2][0] = 0u;
+ orig_pixels_0[2][1] = 0u;
+ orig_pixels_0[2][2] = 0u;
+ orig_pixels_0[2][3] = 0u;
+ orig_pixels_0[3][0] = 0u;
+ orig_pixels_0[3][1] = 0u;
+ orig_pixels_0[3][2] = 0u;
+ orig_pixels_0[3][3] = 0u;
+
+ orig_pixels_1[0][0] = 0u;
+ orig_pixels_1[0][1] = 0u;
+ orig_pixels_1[0][2] = 0u;
+ orig_pixels_1[0][3] = 0u;
+ orig_pixels_1[1][0] = 0u;
+ orig_pixels_1[1][1] = 0u;
+ orig_pixels_1[1][2] = 0u;
+ orig_pixels_1[1][3] = 0u;
+ orig_pixels_1[2][0] = 0u;
+ orig_pixels_1[2][1] = 0u;
+ orig_pixels_1[2][2] = 0u;
+ orig_pixels_1[2][3] = 0u;
+ orig_pixels_1[3][0] = 255u;
+ orig_pixels_1[3][1] = 255u;
+ orig_pixels_1[3][2] = 255u;
+ orig_pixels_1[3][3] = 255u;
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 128);
+ EXPECT_EQ(8u, interpolate_pixels[0][0]);
+ EXPECT_EQ(16u, interpolate_pixels[0][1]);
+ EXPECT_EQ(32u, interpolate_pixels[0][2]);
+ EXPECT_EQ(64u, interpolate_pixels[0][3]);
+ EXPECT_EQ(0u, interpolate_pixels[1][0]);
+ EXPECT_EQ(0u, interpolate_pixels[1][1]);
+ EXPECT_EQ(0u, interpolate_pixels[1][2]);
+ EXPECT_EQ(128u, interpolate_pixels[1][3]);
+ EXPECT_EQ(0u, interpolate_pixels[2][0]);
+ EXPECT_EQ(0u, interpolate_pixels[2][1]);
+ EXPECT_EQ(0u, interpolate_pixels[2][2]);
+ EXPECT_EQ(0u, interpolate_pixels[2][3]);
+ EXPECT_EQ(128u, interpolate_pixels[3][0]);
+ EXPECT_EQ(128u, interpolate_pixels[3][1]);
+ EXPECT_EQ(128u, interpolate_pixels[3][2]);
+ EXPECT_EQ(128u, interpolate_pixels[3][3]);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0][0]);
+ EXPECT_EQ(32u, interpolate_pixels[0][1]);
+ EXPECT_EQ(64u, interpolate_pixels[0][2]);
+ EXPECT_EQ(128u, interpolate_pixels[0][3]);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0][0]);
+ EXPECT_EQ(8u, interpolate_pixels[0][1]);
+ EXPECT_EQ(16u, interpolate_pixels[0][2]);
+ EXPECT_EQ(32u, interpolate_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 1280, 1, 128);
+ }
+}
+
+TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[1280]);
+ SIMD_ALIGNED(uint8 interpolate_pixels[1280]);
+ memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+ memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
+
+ orig_pixels_0[0] = 16u;
+ orig_pixels_0[1] = 32u;
+ orig_pixels_0[2] = 64u;
+ orig_pixels_0[3] = 128u;
+ orig_pixels_0[4] = 0u;
+ orig_pixels_0[5] = 0u;
+ orig_pixels_0[6] = 0u;
+ orig_pixels_0[7] = 255u;
+ orig_pixels_0[8] = 0u;
+ orig_pixels_0[9] = 0u;
+ orig_pixels_0[10] = 0u;
+ orig_pixels_0[11] = 0u;
+ orig_pixels_0[12] = 0u;
+ orig_pixels_0[13] = 0u;
+ orig_pixels_0[14] = 0u;
+ orig_pixels_0[15] = 0u;
+
+ orig_pixels_1[0] = 0u;
+ orig_pixels_1[1] = 0u;
+ orig_pixels_1[2] = 0u;
+ orig_pixels_1[3] = 0u;
+ orig_pixels_1[4] = 0u;
+ orig_pixels_1[5] = 0u;
+ orig_pixels_1[6] = 0u;
+ orig_pixels_1[7] = 0u;
+ orig_pixels_1[8] = 0u;
+ orig_pixels_1[9] = 0u;
+ orig_pixels_1[10] = 0u;
+ orig_pixels_1[11] = 0u;
+ orig_pixels_1[12] = 255u;
+ orig_pixels_1[13] = 255u;
+ orig_pixels_1[14] = 255u;
+ orig_pixels_1[15] = 255u;
+
+ InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 128);
+ EXPECT_EQ(8u, interpolate_pixels[0]);
+ EXPECT_EQ(16u, interpolate_pixels[1]);
+ EXPECT_EQ(32u, interpolate_pixels[2]);
+ EXPECT_EQ(64u, interpolate_pixels[3]);
+ EXPECT_EQ(0u, interpolate_pixels[4]);
+ EXPECT_EQ(0u, interpolate_pixels[5]);
+ EXPECT_EQ(0u, interpolate_pixels[6]);
+ EXPECT_EQ(128u, interpolate_pixels[7]);
+ EXPECT_EQ(0u, interpolate_pixels[8]);
+ EXPECT_EQ(0u, interpolate_pixels[9]);
+ EXPECT_EQ(0u, interpolate_pixels[10]);
+ EXPECT_EQ(0u, interpolate_pixels[11]);
+ EXPECT_EQ(128u, interpolate_pixels[12]);
+ EXPECT_EQ(128u, interpolate_pixels[13]);
+ EXPECT_EQ(128u, interpolate_pixels[14]);
+ EXPECT_EQ(128u, interpolate_pixels[15]);
+
+ InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0]);
+ EXPECT_EQ(32u, interpolate_pixels[1]);
+ EXPECT_EQ(64u, interpolate_pixels[2]);
+ EXPECT_EQ(128u, interpolate_pixels[3]);
+
+ InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0]);
+ EXPECT_EQ(8u, interpolate_pixels[1]);
+ EXPECT_EQ(16u, interpolate_pixels[2]);
+ EXPECT_EQ(32u, interpolate_pixels[3]);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 1280, 1, 123);
+ }
+}
+
+#define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \
+ N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
+ for (int i = 0; i < kStrideA * kHeight; ++i) { \
+ src_argb_a[i + OFF] = (fastrand() & 0xff); \
+ src_argb_b[i + OFF] = (fastrand() & 0xff); \
+ } \
+ MaskCpuFlags(disable_cpu_flags_); \
+ ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \
+ dst_argb_c, kStrideB, kWidth, NEG kHeight, TERP); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \
+ dst_argb_opt, kStrideB, kWidth, NEG kHeight, TERP); \
+ } \
+ for (int i = 0; i < kStrideB * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb_a); \
+ free_aligned_buffer_page_end(src_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTINTERPOLATE(TERP) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
+
+TESTINTERPOLATE(0)
+TESTINTERPOLATE(64)
+TESTINTERPOLATE(128)
+TESTINTERPOLATE(192)
+TESTINTERPOLATE(255)
+
+static int TestBlend(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(src_argb_b, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ src_argb_b[i + off] = (fastrand() & 0xff);
+ }
+ ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+ height);
+ ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
+ height);
+ memset(dst_argb_c, 255, kStride * height);
+ memset(dst_argb_opt, 255, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride,
+ dst_argb_opt, kStride, width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(src_argb_b);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
+ int max_diff =
+ TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static void TestBlendPlane(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 1;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(src_argb_b, kStride * height + off);
+ align_buffer_page_end(src_argb_alpha, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height + off);
+ align_buffer_page_end(dst_argb_opt, kStride * height + off);
+ memset(dst_argb_c, 255, kStride * height + off);
+ memset(dst_argb_opt, 255, kStride * height + off);
+
+ // Test source is maintained exactly if alpha is 255.
+ for (int i = 0; i < width; ++i) {
+ src_argb_a[i + off] = i & 255;
+ src_argb_b[i + off] = 255 - (i & 255);
+ }
+ memset(src_argb_alpha + off, 255, width);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1);
+ for (int i = 0; i < width; ++i) {
+ EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]);
+ }
+ // Test destination is maintained exactly if alpha is 0.
+ memset(src_argb_alpha + off, 0, width);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1);
+ for (int i = 0; i < width; ++i) {
+ EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]);
+ }
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ src_argb_b[i + off] = (fastrand() & 0xff);
+ src_argb_alpha[i + off] = (fastrand() & 0xff);
+ }
+
+ MaskCpuFlags(disable_cpu_flags);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_c + off, width, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_opt + off, width, width,
+ invert * height);
+ }
+ for (int i = 0; i < kStride * height; ++i) {
+ EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]);
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(src_argb_b);
+ free_aligned_buffer_page_end(src_argb_alpha);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return;
+}
+
+TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {
+ TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) {
+ TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Any) {
+ TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
+ TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 1);
+}
+
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+
+static void TestI420Blend(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ width = ((width) > 0) ? (width) : 1;
+ const int kStrideUV = SUBSAMPLE(width, 2);
+ const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2);
+ align_buffer_page_end(src_y0, width * height + off);
+ align_buffer_page_end(src_u0, kSizeUV + off);
+ align_buffer_page_end(src_v0, kSizeUV + off);
+ align_buffer_page_end(src_y1, width * height + off);
+ align_buffer_page_end(src_u1, kSizeUV + off);
+ align_buffer_page_end(src_v1, kSizeUV + off);
+ align_buffer_page_end(src_a, width * height + off);
+ align_buffer_page_end(dst_y_c, width * height + off);
+ align_buffer_page_end(dst_u_c, kSizeUV + off);
+ align_buffer_page_end(dst_v_c, kSizeUV + off);
+ align_buffer_page_end(dst_y_opt, width * height + off);
+ align_buffer_page_end(dst_u_opt, kSizeUV + off);
+ align_buffer_page_end(dst_v_opt, kSizeUV + off);
+
+ MemRandomize(src_y0, width * height + off);
+ MemRandomize(src_u0, kSizeUV + off);
+ MemRandomize(src_v0, kSizeUV + off);
+ MemRandomize(src_y1, width * height + off);
+ MemRandomize(src_u1, kSizeUV + off);
+ MemRandomize(src_v1, kSizeUV + off);
+ MemRandomize(src_a, width * height + off);
+ memset(dst_y_c, 255, width * height + off);
+ memset(dst_u_c, 255, kSizeUV + off);
+ memset(dst_v_c, 255, kSizeUV + off);
+ memset(dst_y_opt, 255, width * height + off);
+ memset(dst_u_opt, 255, kSizeUV + off);
+ memset(dst_v_opt, 255, kSizeUV + off);
+
+ MaskCpuFlags(disable_cpu_flags);
+ I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off,
+ kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV,
+ src_v1 + off, kStrideUV, src_a + off, width, dst_y_c + off, width,
+ dst_u_c + off, kStrideUV, dst_v_c + off, kStrideUV, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off,
+ kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV,
+ src_v1 + off, kStrideUV, src_a + off, width, dst_y_opt + off,
+ width, dst_u_opt + off, kStrideUV, dst_v_opt + off, kStrideUV,
+ width, invert * height);
+ }
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
+ }
+ for (int i = 0; i < kSizeUV; ++i) {
+ EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+ EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
+ }
+ free_aligned_buffer_page_end(src_y0);
+ free_aligned_buffer_page_end(src_u0);
+ free_aligned_buffer_page_end(src_v0);
+ free_aligned_buffer_page_end(src_y1);
+ free_aligned_buffer_page_end(src_u1);
+ free_aligned_buffer_page_end(src_v1);
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_opt);
+ return;
+}
+
+TEST_F(LibYUVPlanarTest, I420Blend_Opt) {
+ TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) {
+ TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+
+// TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable.
+TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) {
+ TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, I420Blend_Invert) {
+ TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestAffine) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);
+
+ for (int i = 0; i < 1280; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels_0[i][j] = i;
+ }
+ }
+
+ float uv_step[4] = {0.f, 0.f, 0.75f, 0.f};
+
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], uv_step,
+ 1280);
+ EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
+ EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
+ EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ SIMD_ALIGNED(uint8 interpolate_pixels_Opt[1280][4]);
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 1280);
+ EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4));
+
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ if (has_sse2) {
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 1280);
+ }
+ }
+#endif
+}
+
+TEST_F(LibYUVPlanarTest, TestCopyPlane) {
+ int err = 0;
+ int yw = benchmark_width_;
+ int yh = benchmark_height_;
+ int b = 12;
+ int i, j;
+
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ align_buffer_page_end(orig_y, y_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ memset(orig_y, 0, y_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 0, y_plane_size);
+
+ // Fill image buffers with random data.
+ for (i = b; i < (yh + b); ++i) {
+ for (j = b; j < (yw + b); ++j) {
+ orig_y[i * (yw + b * 2) + j] = fastrand() & 0xff;
+ }
+ }
+
+ // Fill destination buffers with random data.
+ for (i = 0; i < y_plane_size; ++i) {
+ uint8 random_number = fastrand() & 0x7f;
+ dst_c[i] = random_number;
+ dst_opt[i] = dst_c[i];
+ }
+
+ int y_off = b * (yw + b * 2) + b;
+
+ int y_st = yw + b * 2;
+ int stride = 8;
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ if (dst_c[i] != dst_opt[i])
+ ++err;
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+
+ EXPECT_EQ(0, err);
+}
+
+static int TestMultiply(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(src_argb_b, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ src_argb_b[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride,
+ dst_argb_opt, kStride, width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(src_argb_b);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {
+ int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) {
+ int max_diff =
+ TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) {
+ int max_diff =
+ TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) {
+ int max_diff =
+ TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static int TestAdd(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(src_argb_b, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ src_argb_b[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt,
+ kStride, width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(src_argb_b);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {
+ int max_diff =
+ TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) {
+ int max_diff =
+ TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) {
+ int max_diff =
+ TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) {
+ int max_diff =
+ TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static int TestSubtract(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(src_argb_b, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ src_argb_b[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride,
+ dst_argb_opt, kStride, width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(src_argb_b);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {
+ int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) {
+ int max_diff =
+ TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) {
+ int max_diff =
+ TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) {
+ int max_diff =
+ TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static int TestSobel(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ memset(src_argb_a, 0, kStride * height + off);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBSobel(src_argb_a + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSobel(src_argb_a + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {
+ int max_diff =
+ TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) {
+ int max_diff =
+ TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) {
+ int max_diff =
+ TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) {
+ int max_diff =
+ TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+static int TestSobelToPlane(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kSrcBpp = 4;
+ const int kDstBpp = 1;
+ const int kSrcStride = (width * kSrcBpp + 15) & ~15;
+ const int kDstStride = (width * kDstBpp + 15) & ~15;
+ align_buffer_page_end(src_argb_a, kSrcStride * height + off);
+ align_buffer_page_end(dst_argb_c, kDstStride * height);
+ align_buffer_page_end(dst_argb_opt, kDstStride * height);
+ memset(src_argb_a, 0, kSrcStride * height + off);
+ for (int i = 0; i < kSrcStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kDstStride * height);
+ memset(dst_argb_opt, 0, kDstStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_c, kDstStride, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_opt, kDstStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kDstStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {
+ int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) {
+ int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) {
+ int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, -1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) {
+ int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+static int TestSobelXY(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ memset(src_argb_a, 0, kStride * height + off);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBSobelXY(src_argb_a + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSobelXY(src_argb_a + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {
+ int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) {
+ int max_diff =
+ TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) {
+ int max_diff =
+ TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) {
+ int max_diff =
+ TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+static int TestBlur(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off,
+ int radius) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_page_end(src_argb_a, kStride * height + off);
+ align_buffer_page_end(dst_cumsum, width * height * 16);
+ align_buffer_page_end(dst_argb_c, kStride * height);
+ align_buffer_page_end(dst_argb_opt, kStride * height);
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (fastrand() & 0xff);
+ }
+ memset(dst_cumsum, 0, width * height * 16);
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(disable_cpu_flags);
+ ARGBBlur(src_argb_a + off, kStride, dst_argb_c, kStride,
+ reinterpret_cast(dst_cumsum), width * 4, width,
+ invert * height, radius);
+ MaskCpuFlags(benchmark_cpu_info);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBBlur(src_argb_a + off, kStride, dst_argb_opt, kStride,
+ reinterpret_cast(dst_cumsum), width * 4, width,
+ invert * height, radius);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff = abs(static_cast(dst_argb_c[i]) -
+ static_cast(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(src_argb_a);
+ free_aligned_buffer_page_end(dst_cumsum);
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+static const int kBlurSize = 55;
+TEST_F(LibYUVPlanarTest, ARGBBlur_Any) {
+ int max_diff =
+ TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) {
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) {
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+static const int kBlurSmallSize = 5;
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) {
+ int max_diff =
+ TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) {
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) {
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) {
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
+ 0.94230f, -3.03300f, -2.92500f, 0.f, // C0
+ 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
+ 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
+ 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x
+ };
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test white
+ orig_pixels[3][0] = 255u;
+ orig_pixels[3][1] = 255u;
+ orig_pixels[3][2] = 255u;
+ orig_pixels[3][3] = 255u;
+ // Test color
+ orig_pixels[4][0] = 16u;
+ orig_pixels[4][1] = 64u;
+ orig_pixels[4][2] = 192u;
+ orig_pixels[4][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kWarmifyPolynomial[0], 16, 1);
+ EXPECT_EQ(235u, dst_pixels_opt[0][0]);
+ EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+ EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+ EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+ EXPECT_EQ(233u, dst_pixels_opt[1][1]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+ EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+ EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+ EXPECT_EQ(241u, dst_pixels_opt[2][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+ EXPECT_EQ(235u, dst_pixels_opt[3][0]);
+ EXPECT_EQ(233u, dst_pixels_opt[3][1]);
+ EXPECT_EQ(241u, dst_pixels_opt[3][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[3][3]);
+ EXPECT_EQ(10u, dst_pixels_opt[4][0]);
+ EXPECT_EQ(59u, dst_pixels_opt[4][1]);
+ EXPECT_EQ(188u, dst_pixels_opt[4][2]);
+ EXPECT_EQ(224u, dst_pixels_opt[4][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+ &kWarmifyPolynomial[0], 1280, 1);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kWarmifyPolynomial[0], 1280, 1);
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+ EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+ EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+ EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+ }
+}
+
+int TestHalfFloatPlane(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ float scale,
+ int mask) {
+ int i, j;
+ const int y_plane_size = benchmark_width * benchmark_height * 2;
+
+ align_buffer_page_end(orig_y, y_plane_size * 3);
+ uint8* dst_opt = orig_y + y_plane_size;
+ uint8* dst_c = orig_y + y_plane_size * 2;
+
+ MemRandomize(orig_y, y_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 1, y_plane_size);
+
+ for (i = 0; i < y_plane_size / 2; ++i) {
+ reinterpret_cast(orig_y)[i] &= mask;
+ }
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags);
+ for (j = 0; j < benchmark_iterations; j++) {
+ HalfFloatPlane(reinterpret_cast(orig_y), benchmark_width * 2,
+ reinterpret_cast(dst_c), benchmark_width * 2, scale,
+ benchmark_width, benchmark_height);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info);
+ for (j = 0; j < benchmark_iterations; j++) {
+ HalfFloatPlane(reinterpret_cast(orig_y), benchmark_width * 2,
+ reinterpret_cast(dst_opt), benchmark_width * 2,
+ scale, benchmark_width, benchmark_height);
+ }
+
+ int max_diff = 0;
+ for (i = 0; i < y_plane_size / 2; ++i) {
+ int abs_diff = abs(static_cast(reinterpret_cast(dst_c)[i]) -
+ static_cast(reinterpret_cast(dst_opt)[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+#if defined(__arm__)
+static void EnableFlushDenormalToZero(void) {
+ uint32_t cw;
+ __asm__ __volatile__(
+ "vmrs %0, fpscr \n"
+ "orr %0, %0, #0x1000000 \n"
+ "vmsr fpscr, %0 \n"
+ : "=r"(cw)::"memory");
+}
+#endif
+
+// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
+// exponent to be less than 0. 15 - log2(65536) = -1/ This shouldnt normally
+// happen since scale is 1/(1<(dst_argb_c[i + off]) -
+ static_cast(dst_argb_opt[i + off]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
+ int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0, 4);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) {
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 4);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Invert) {
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 4);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Any) {
+ int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) {
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Invert) {
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2);
+ align_buffer_page_end(tmp_pixels_u, kPixels);
+ align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(tmp_pixels_u, kPixels);
+ MemRandomize(tmp_pixels_v, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
+ tmp_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
+ tmp_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2);
+ align_buffer_page_end(tmp_pixels_u, kPixels);
+ align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(tmp_pixels_u, kPixels);
+ MemRandomize(tmp_pixels_v, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
+ tmp_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u,
+ benchmark_width_, tmp_pixels_v, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/rotate_argb_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/rotate_argb_test.cc
new file mode 100644
index 0000000..d200389
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/rotate_argb_test.cc
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate_argb.h"
+
+namespace libyuv {
+
+void TestRotateBpp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ const int kBpp) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height < 1) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_stride_argb = src_width * kBpp;
+ int src_argb_plane_size = src_stride_argb * abs(src_height);
+ align_buffer_page_end(src_argb, src_argb_plane_size);
+ for (int i = 0; i < src_argb_plane_size; ++i) {
+ src_argb[i] = fastrand() & 0xff;
+ }
+
+ int dst_stride_argb = dst_width * kBpp;
+ int dst_argb_plane_size = dst_stride_argb * dst_height;
+ align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+ align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ if (kBpp == 1) {
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ RotatePlane(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ RotatePlane(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ } else if (kBpp == 4) {
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ ARGBRotate(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBRotate(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_argb_plane_size; ++i) {
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ free_aligned_buffer_page_end(src_argb);
+}
+
+static void ARGBTestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
+ benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 4);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void TestRotatePlane(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
+ benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 1);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/rotate_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/rotate_test.cc
new file mode 100644
index 0000000..d04b96e
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/rotate_test.cc
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate.h"
+
+namespace libyuv {
+
+static void I420TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i420_y_size = src_width * Abs(src_height);
+ int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
+ int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
+ align_buffer_page_end(src_i420, src_i420_size);
+ for (int i = 0; i < src_i420_size; ++i) {
+ src_i420[i] = fastrand() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_page_end(dst_i420_c, dst_i420_size);
+ align_buffer_page_end(dst_i420_opt, dst_i420_size);
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
+ (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
+ (src_width + 1) / 2, dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I420Rotate(
+ src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
+ src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
+ dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
+ (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i420_c);
+ free_aligned_buffer_page_end(dst_i420_opt);
+ free_aligned_buffer_page_end(src_i420);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but disabled because they use C code and can be
+// tested by passing an odd width command line or environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+static void NV12TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) { // allow negative for inversion test.
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_nv12_y_size = src_width * Abs(src_height);
+ int src_nv12_uv_size =
+ ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
+ int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
+ align_buffer_page_end(src_nv12, src_nv12_size);
+ for (int i = 0; i < src_nv12_size; ++i) {
+ src_nv12[i] = fastrand() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_page_end(dst_i420_c, dst_i420_size);
+ align_buffer_page_end(dst_i420_opt, dst_i420_size);
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_opt, dst_width,
+ dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i420_c);
+ free_aligned_buffer_page_end(dst_i420_opt);
+ free_aligned_buffer_page_end(src_nv12);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/scale_argb_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/scale_argb_test.cc
new file mode 100644
index 0000000..d11aec2
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/scale_argb_test.cc
@@ -0,0 +1,449 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/video_common.h"
+
+namespace libyuv {
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int ARGBTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ const int b = 0; // 128 to test for padding/stride.
+ int64 src_argb_plane_size =
+ (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4LL;
+ int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+ align_buffer_page_end(src_argb, src_argb_plane_size);
+ if (!src_argb) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_argb, src_argb_plane_size);
+
+ int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+ align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+ if (!dst_argb_c || !dst_argb_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
+
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast(c_time * 1e6), static_cast(opt_time * 1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference isn't
+ // over 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ free_aligned_buffer_page_end(src_argb);
+ return max_diff;
+}
+
+static const int kTileX = 8;
+static const int kTileY = 8;
+
+static int TileARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ FilterMode filtering) {
+ for (int y = 0; y < dst_height; y += kTileY) {
+ for (int x = 0; x < dst_width; x += kTileX) {
+ int clip_width = kTileX;
+ if (x + clip_width > dst_width) {
+ clip_width = dst_width - x;
+ }
+ int clip_height = kTileY;
+ if (y + clip_height > dst_height) {
+ clip_height = dst_height - y;
+ }
+ int r = ARGBScaleClip(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height, x,
+ y, clip_width, clip_height, filtering);
+ if (r) {
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+static int ARGBClipTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ const int b = 128;
+ int64 src_argb_plane_size =
+ (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4;
+ int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+ align_buffer_page_end(src_argb, src_argb_plane_size);
+ if (!src_argb) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ memset(src_argb, 1, src_argb_plane_size);
+
+ int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ int i, j;
+ for (i = b; i < (Abs(src_height) + b); ++i) {
+ for (j = b; j < (Abs(src_width) + b) * 4; ++j) {
+ src_argb[(i * src_stride_argb) + j] = (fastrand() & 0xff);
+ }
+ }
+
+ align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+ align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+ if (!dst_argb_c || !dst_argb_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Do full image, no clipping.
+ double c_time = get_time();
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ // Do tiled image, clipping scale to a tile at a time.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ TileARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of Full vs Tiled.
+ printf("filter %d - %8d us Full - %8d us Tiled\n", f,
+ static_cast(c_time * 1e6), static_cast(opt_time * 1e6));
+
+ // Compare full scaled image vs tiled image.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ free_aligned_buffer_page_end(src_argb);
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
+ int diff = ARGBTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
+ int diff = ARGBClipTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
+// filtering is different fixed point implementations for SSSE3, Neon and C.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(name, Box, nom, denom, 3)
+
+TEST_FACTOR(2, 1, 2)
+TEST_FACTOR(4, 1, 4)
+TEST_FACTOR(8, 1, 8)
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = ARGBTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
+ int diff = \
+ ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
+ int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+/// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+
+TEST_SCALETO(ARGBScale, 1, 1)
+TEST_SCALETO(ARGBScale, 320, 240)
+TEST_SCALETO(ARGBScale, 352, 288)
+TEST_SCALETO(ARGBScale, 569, 480)
+TEST_SCALETO(ARGBScale, 640, 360)
+TEST_SCALETO(ARGBScale, 1280, 720)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleReference2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint32 /* src_fourcc */, // TODO: Add support.
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ uint32 /* dst_fourcc */, // TODO: Add support.
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ uint8* argb_buffer = static_cast(malloc(src_width * src_height * 4));
+ int r;
+ I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ argb_buffer, src_width * 4, src_width, src_height);
+
+ r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+ clip_width, clip_height, filtering);
+ free(argb_buffer);
+ return r;
+}
+
+static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
+ int rv = v;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ *buf++ = v;
+ v += dx;
+ if (v < 0 || v > 255) {
+ dx = -dx;
+ v += dx;
+ }
+ }
+ v = rv + dy;
+ if (v < 0 || v > 255) {
+ dy = -dy;
+ v += dy;
+ }
+ rv = v;
+ }
+}
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int YUVToARGBTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations) {
+ int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
+ int64 src_uv_plane_size =
+ ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = (Abs(src_width) + 1) / 2;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+
+ int64 dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
+ int dst_stride_argb = (dst_width)*4;
+ align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+ align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+ if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ // Fill YUV image with continuous ramp, which is less sensitive to
+ // subsampling and filtering differences for test purposes.
+ FillRamp(src_y, Abs(src_width), Abs(src_height), 128, 1, 1);
+ FillRamp(src_u, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 3, 1, 1);
+ FillRamp(src_v, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 4, 1, 1);
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ YUVToARGBScaleReference2(src_y, src_stride_y, src_u, src_stride_uv, src_v,
+ src_stride_uv, libyuv::FOURCC_I420, src_width,
+ src_height, dst_argb_c, dst_stride_argb,
+ libyuv::FOURCC_I420, dst_width, dst_height, 0, 0,
+ dst_width, dst_height, f);
+
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ YUVToARGBScaleClip(src_y, src_stride_y, src_u, src_stride_uv, src_v,
+ src_stride_uv, libyuv::FOURCC_I420, src_width,
+ src_height, dst_argb_opt, dst_stride_argb,
+ libyuv::FOURCC_I420, dst_width, dst_height, 0, 0,
+ dst_width, dst_height, f);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < dst_height; ++i) {
+ for (int j = 0; j < dst_width * 4; ++j) {
+ int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ printf("error %d at %d,%d c %d opt %d", abs_diff, j, i,
+ dst_argb_c[(i * dst_stride_argb) + j],
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ EXPECT_LE(abs_diff, 40);
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_argb_c);
+ free_aligned_buffer_page_end(dst_argb_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ return max_diff;
+}
+
+TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
+ int diff =
+ YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
+ benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
+ libyuv::kFilterBilinear, benchmark_iterations_);
+ EXPECT_LE(diff, 10);
+}
+
+TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
+ int diff = YUVToARGBTestFilter(
+ benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
+ benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_);
+ EXPECT_LE(diff, 10);
+}
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/scale_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/scale_test.cc
new file mode 100644
index 0000000..8367dd2
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/scale_test.cc
@@ -0,0 +1,453 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include
+#include
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+namespace libyuv {
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ const int b = 0; // 128 to test for padding/stride.
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
+ int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+
+ int src_stride_y = b * 2 + Abs(src_width);
+ int src_stride_uv = b * 2 + src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size)
+ align_buffer_page_end(src_u, src_uv_plane_size) align_buffer_page_end(
+ src_v, src_uv_plane_size) if (!src_y || !src_u || !src_v) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+ int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+
+ int dst_stride_y = b * 2 + dst_width;
+ int dst_stride_uv = b * 2 + dst_width_uv;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size)
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size)
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
+ align_buffer_page_end(
+ dst_v_opt,
+ dst_uv_plane_size) if (!dst_y_c || !dst_u_c ||
+ !dst_v_c || !dst_y_opt ||
+ !dst_u_opt || !dst_v_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast(c_time * 1e6), static_cast(opt_time * 1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference isn't
+ // over 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b; j < (dst_width + b); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = b; i < (dst_height_uv + b); ++i) {
+ for (j = b; j < (dst_width_uv + b); ++j) {
+ int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c) free_aligned_buffer_page_end(dst_u_c)
+ free_aligned_buffer_page_end(dst_v_c)
+ free_aligned_buffer_page_end(dst_y_opt)
+ free_aligned_buffer_page_end(dst_u_opt)
+ free_aligned_buffer_page_end(dst_v_opt)
+
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
+
+ return max_diff;
+}
+
+// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
+// 0 = exact.
+static int TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ const int b = 0; // 128 to test for padding/stride.
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
+ int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+
+ int src_stride_y = b * 2 + Abs(src_width);
+ int src_stride_uv = b * 2 + src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size) align_buffer_page_end(
+ src_u, src_uv_plane_size) align_buffer_page_end(src_v, src_uv_plane_size)
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2)
+ align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
+ align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
+ uint16* p_src_y_16 = reinterpret_cast(src_y_16);
+ uint16* p_src_u_16 = reinterpret_cast(src_u_16);
+ uint16* p_src_v_16 = reinterpret_cast(src_v_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = b; i < src_height + b; ++i) {
+ for (j = b; j < src_width + b; ++j) {
+ p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j];
+ }
+ }
+
+ for (i = b; i < (src_height_uv + b); ++i) {
+ for (j = b; j < (src_width_uv + b); ++j) {
+ p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j];
+ p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j];
+ }
+ }
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+ int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+
+ int dst_stride_y = b * 2 + dst_width;
+ int dst_stride_uv = b * 2 + dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size)
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
+ align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
+ align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
+
+ uint16* p_dst_y_16 =
+ reinterpret_cast(dst_y_16);
+ uint16* p_dst_u_16 = reinterpret_cast(dst_u_16);
+ uint16* p_dst_v_16 = reinterpret_cast(dst_v_16);
+
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
+
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
+ p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
+ p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
+ p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
+ p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
+ }
+
+ // Expect an exact match
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b; j < (dst_width + b); ++j) {
+ int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] -
+ p_dst_y_16[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = b; i < (dst_height_uv + b); ++i) {
+ for (j = b; j < (dst_width_uv + b); ++j) {
+ int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] -
+ p_dst_u_16[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] -
+ p_dst_v_16[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8) free_aligned_buffer_page_end(dst_u_8)
+ free_aligned_buffer_page_end(dst_v_8)
+ free_aligned_buffer_page_end(dst_y_16)
+ free_aligned_buffer_page_end(dst_u_16)
+ free_aligned_buffer_page_end(dst_v_16)
+
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
+ free_aligned_buffer_page_end(src_y_16)
+ free_aligned_buffer_page_end(src_u_16)
+ free_aligned_buffer_page_end(src_v_16)
+
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+// 2 is chroma subsample
+#define DX(x, nom, denom) static_cast(((Abs(x) / nom + 1) / 2) * nom * 2)
+#define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
+ int diff = TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
+ int diff = TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
+// filtering is different fixed point implementations for SSSE3, Neon and C.
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 0)
+TEST_FACTOR(8, 1, 8, 0)
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
+TEST_FACTOR(3, 1, 3, 0)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##To##width##x##height##_##filter##_16) { \
+ int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##From##width##x##height##_##filter##_16) { \
+ int diff = TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 0) \
+ TEST_SCALETO1(name, width, height, Bilinear, 0) \
+ TEST_SCALETO1(name, width, height, Box, 0)
+
+TEST_SCALETO(Scale, 1, 1)
+TEST_SCALETO(Scale, 320, 240)
+TEST_SCALETO(Scale, 352, 288)
+TEST_SCALETO(Scale, 569, 480)
+TEST_SCALETO(Scale, 640, 360)
+TEST_SCALETO(Scale, 1280, 720)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+TEST_F(LibYUVScaleTest, TestScaleOdd) {
+ SIMD_ALIGNED(uint8 orig_pixels[128 * 2]);
+ SIMD_ALIGNED(uint8 dst_pixels_opt[64]);
+ SIMD_ALIGNED(uint8 dst_pixels_c[64]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+
+ // TL
+ orig_pixels[0] = 255u;
+ orig_pixels[1] = 0u;
+ orig_pixels[128 + 0] = 0u;
+ orig_pixels[128 + 1] = 0u;
+ // TR
+ orig_pixels[2] = 0u;
+ orig_pixels[3] = 100u;
+ orig_pixels[128 + 2] = 0u;
+ orig_pixels[128 + 3] = 0u;
+ // BL
+ orig_pixels[4] = 0u;
+ orig_pixels[5] = 0u;
+ orig_pixels[128 + 4] = 50u;
+ orig_pixels[128 + 5] = 0u;
+ // BR
+ orig_pixels[6] = 0u;
+ orig_pixels[7] = 0u;
+ orig_pixels[128 + 6] = 0u;
+ orig_pixels[128 + 7] = 20u;
+ // Odd
+ orig_pixels[126] = 4u;
+ orig_pixels[127] = 255u;
+ orig_pixels[128 + 126] = 16u;
+ orig_pixels[128 + 127] = 255u;
+
+ // Test regular half size.
+ ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(133u, dst_pixels_c[63]);
+
+ // Test Odd width version - Last pixel is just 1 horizontal pixel.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(10u, dst_pixels_c[63]);
+
+ // Test one pixel less, should skip the last pixel.
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(0u, dst_pixels_c[63]);
+
+ // Test regular half size SSSE3.
+ ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+
+ EXPECT_EQ(64u, dst_pixels_opt[0]);
+ EXPECT_EQ(25u, dst_pixels_opt[1]);
+ EXPECT_EQ(13u, dst_pixels_opt[2]);
+ EXPECT_EQ(5u, dst_pixels_opt[3]);
+ EXPECT_EQ(0u, dst_pixels_opt[4]);
+ EXPECT_EQ(133u, dst_pixels_opt[63]);
+
+ // Compare C and SSSE3 match.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+ ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+ for (int i = 0; i < 64; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+}
+#endif // HAS_SCALEROWDOWN2_SSSE3
+
+} // namespace libyuv
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/testdata/arm_v7.txt b/libyuv/src/main/cpp/libyuv/unit_test/testdata/arm_v7.txt
new file mode 100644
index 0000000..5d7dbd0
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/testdata/arm_v7.txt
@@ -0,0 +1,12 @@
+Processor : ARMv7 Processor rev 5 (v7l)
+BogoMIPS : 795.44
+Features : swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16
+CPU implementer : 0x56
+CPU architecture: 7
+CPU variant : 0x0
+CPU part : 0x581
+CPU revision : 5
+
+Hardware : OLPC XO-1.75
+Revision : 0000
+Serial : 0000000000000000
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/testdata/juno.txt b/libyuv/src/main/cpp/libyuv/unit_test/testdata/juno.txt
new file mode 100644
index 0000000..dd46527
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/testdata/juno.txt
@@ -0,0 +1,15 @@
+Processor : AArch64 Processor rev 0 (aarch64)
+processor : 0
+processor : 1
+processor : 2
+processor : 3
+processor : 4
+processor : 5
+Features : fp asimd evtstrm aes pmull sha1 sha2 crc32
+CPU implementer : 0x41
+CPU architecture: AArch64
+CPU variant : 0x0
+CPU part : 0xd07
+CPU revision : 0
+
+Hardware : Juno
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/testdata/tegra3.txt b/libyuv/src/main/cpp/libyuv/unit_test/testdata/tegra3.txt
new file mode 100644
index 0000000..d1b09f6
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/testdata/tegra3.txt
@@ -0,0 +1,23 @@
+Processor : ARMv7 Processor rev 9 (v7l)
+processor : 0
+BogoMIPS : 1992.29
+
+processor : 1
+BogoMIPS : 1992.29
+
+processor : 2
+BogoMIPS : 1992.29
+
+processor : 3
+BogoMIPS : 1992.29
+
+Features : swp half thumb fastmult vfp edsp neon vfpv3
+CPU implementer : 041
+CPU architecture: 7
+CPU variant : 02
+CPU part : 0xc09
+CPU revision : 9
+
+Hardware : cardhu
+Revision : 0000
+
diff --git a/libyuv/src/main/cpp/libyuv/unit_test/unit_test.cc b/libyuv/src/main/cpp/libyuv/unit_test/unit_test.cc
new file mode 100644
index 0000000..8cc124a
--- /dev/null
+++ b/libyuv/src/main/cpp/libyuv/unit_test/unit_test.cc
@@ -0,0 +1,470 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+
+#include // For getenv()
+
+#include
+
+#ifdef LIBYUV_USE_GFLAGS
+#include "gflags/gflags.h"
+#endif
+
+// Change this to 1000 for benchmarking.
+// TODO(fbarchard): Add command line parsing to pass this as option.
+#define BENCHMARK_ITERATIONS 1
+
+unsigned int fastrand_seed = 0xfb;
+
+#ifdef LIBYUV_USE_GFLAGS
+DEFINE_int32(libyuv_width, 0, "width of test image.");
+DEFINE_int32(libyuv_height, 0, "height of test image.");
+DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
+DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD");
+DEFINE_int32(libyuv_cpu_info,
+ 0,
+ "cpu flags for benchmark code. 1 = C, -1 = SIMD");
+#else
+// Disable command line parameters if gflags disabled.
+static const int32 FLAGS_libyuv_width = 0;
+static const int32 FLAGS_libyuv_height = 0;
+static const int32 FLAGS_libyuv_repeat = 0;
+static const int32 FLAGS_libyuv_flags = 0;
+static const int32 FLAGS_libyuv_cpu_info = 0;
+#endif
+
+// For quicker unittests, default is 128 x 72. But when benchmarking,
+// default to 720p. Allow size to specify.
+// Set flags to -1 for benchmarking to avoid slower C code.
+
+LibYUVConvertTest::LibYUVConvertTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
+ if (FLAGS_libyuv_repeat) {
+ benchmark_iterations_ = FLAGS_libyuv_repeat;
+ }
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ if (FLAGS_libyuv_width) {
+ benchmark_width_ = FLAGS_libyuv_width;
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
+ if (FLAGS_libyuv_height) {
+ benchmark_height_ = FLAGS_libyuv_height;
+ }
+ const char* cpu_flags = getenv("LIBYUV_FLAGS");
+ if (cpu_flags) {
+ disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_flags) {
+ disable_cpu_flags_ = FLAGS_libyuv_flags;
+ }
+ const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+ if (cpu_info) {
+ benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_cpu_info) {
+ benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ }
+ benchmark_pixels_div256_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
+}
+
+LibYUVColorTest::LibYUVColorTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
+ if (FLAGS_libyuv_repeat) {
+ benchmark_iterations_ = FLAGS_libyuv_repeat;
+ }
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ if (FLAGS_libyuv_width) {
+ benchmark_width_ = FLAGS_libyuv_width;
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
+ if (FLAGS_libyuv_height) {
+ benchmark_height_ = FLAGS_libyuv_height;
+ }
+ const char* cpu_flags = getenv("LIBYUV_FLAGS");
+ if (cpu_flags) {
+ disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_flags) {
+ disable_cpu_flags_ = FLAGS_libyuv_flags;
+ }
+ const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+ if (cpu_info) {
+ benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_cpu_info) {
+ benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ }
+ benchmark_pixels_div256_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
+}
+
+LibYUVScaleTest::LibYUVScaleTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
+ if (FLAGS_libyuv_repeat) {
+ benchmark_iterations_ = FLAGS_libyuv_repeat;
+ }
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ if (FLAGS_libyuv_width) {
+ benchmark_width_ = FLAGS_libyuv_width;
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
+ if (FLAGS_libyuv_height) {
+ benchmark_height_ = FLAGS_libyuv_height;
+ }
+ const char* cpu_flags = getenv("LIBYUV_FLAGS");
+ if (cpu_flags) {
+ disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_flags) {
+ disable_cpu_flags_ = FLAGS_libyuv_flags;
+ }
+ const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+ if (cpu_info) {
+ benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_cpu_info) {
+ benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ }
+ benchmark_pixels_div256_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
+}
+
+LibYUVRotateTest::LibYUVRotateTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
+ if (FLAGS_libyuv_repeat) {
+ benchmark_iterations_ = FLAGS_libyuv_repeat;
+ }
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ if (FLAGS_libyuv_width) {
+ benchmark_width_ = FLAGS_libyuv_width;
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
+ if (FLAGS_libyuv_height) {
+ benchmark_height_ = FLAGS_libyuv_height;
+ }
+ const char* cpu_flags = getenv("LIBYUV_FLAGS");
+ if (cpu_flags) {
+ disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_flags) {
+ disable_cpu_flags_ = FLAGS_libyuv_flags;
+ }
+ const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+ if (cpu_info) {
+ benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_cpu_info) {
+ benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ }
+ benchmark_pixels_div256_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
+}
+
+LibYUVPlanarTest::LibYUVPlanarTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
+ if (FLAGS_libyuv_repeat) {
+ benchmark_iterations_ = FLAGS_libyuv_repeat;
+ }
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ if (FLAGS_libyuv_width) {
+ benchmark_width_ = FLAGS_libyuv_width;
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
+ if (FLAGS_libyuv_height) {
+ benchmark_height_ = FLAGS_libyuv_height;
+ }
+ const char* cpu_flags = getenv("LIBYUV_FLAGS");
+ if (cpu_flags) {
+ disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_flags) {
+ disable_cpu_flags_ = FLAGS_libyuv_flags;
+ }
+ const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+ if (cpu_info) {
+ benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_cpu_info) {
+ benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ }
+ benchmark_pixels_div256_ =
+ static_cast((static_cast(Abs(benchmark_width_)) *
+ static_cast(Abs(benchmark_height_)) *
+ static_cast(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast((static_cast