diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst
index 4f9134e0b515..3f8095440e6b 100644
--- a/doc/release/1.17.0-notes.rst
+++ b/doc/release/1.17.0-notes.rst
@@ -319,6 +319,31 @@ was accidental. The old behavior can be retained with
 ``structured_to_unstructured(arr[['a']]).squeeze(axis=-1)`` or far more simply,
 ``arr['a']``.
 
+``clip`` now uses a ufunc under the hood
+----------------------------------------
+This means that registering clip functions for custom dtypes in C via
+``descr->f->fastclip`` is deprecated - they should use the ufunc registration
+mechanism instead, attaching to the ``np.core.umath.clip`` ufunc.
+
+It also means that ``clip`` accepts ``where`` and ``casting`` arguments,
+and can be overridden with ``__array_ufunc__``.
+
+A consequence of this change is that some behaviors of the old ``clip`` have
+been deprecated:
+
+* Passing ``nan`` to mean "do not clip" as one or both bounds. This didn't work
+  in all cases anyway, and can be better handled by passing infinities of the
+  appropriate sign.
+* Using "unsafe" casting by default when an ``out`` argument is passed. Using
+  ``casting="unsafe"`` explicitly will silence this warning.
+
+Additionally, there are some corner cases with behavior changes:
+
+* Passing ``max < min`` has changed to be more consistent across dtypes, but
+  should not be relied upon.
+* Scalar ``min`` and ``max`` take part in promotion rules like they do in all
+  other ufuncs.
+
 ``__array_interface__`` offset now works as documented
 ------------------------------------------------------
 The interface may use an ``offset`` value that was mistakenly ignored.
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 3e9d056747a1..e5e7f6667c18 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -2771,7 +2771,7 @@
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('clip',
     """
-    a.clip(min=None, max=None, out=None)
+    a.clip(min=None, max=None, out=None, **kwargs)
 
     Return an array whose values are limited to ``[min, max]``.
     One of max or min must be given.
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index 3ab64f7a1ee2..ba6f7d1112ad 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -11,6 +11,7 @@
 from numpy.core import umath as um
 from numpy.core._asarray import asanyarray
 from numpy.core import numerictypes as nt
+from numpy.core import _exceptions
 from numpy._globals import _NoValue
 
 # save those O(100) nanoseconds!
@@ -55,6 +56,80 @@ def _count_reduce_items(arr, axis):
         items *= arr.shape[ax]
     return items
 
+# Numpy 1.17.0, 2019-02-24
+# Various clip behavior deprecations, marked with _clip_dep as a prefix.
+
+def _clip_dep_is_scalar_nan(a):
+    # imported lazily to avoid a circular import
+    from numpy.core.fromnumeric import ndim
+    if ndim(a) != 0:
+        return False
+    try:
+        return um.isnan(a)
+    except TypeError:
+        return False
+
+def _clip_dep_is_byte_swapped(a):
+    if isinstance(a, mu.ndarray):
+        return not a.dtype.isnative
+    return False
+
+def _clip_dep_invoke_with_casting(ufunc, *args, out=None, casting=None, **kwargs):
+    # normal path
+    if casting is not None:
+        return ufunc(*args, out=out, casting=casting, **kwargs)
+
+    # try to deal with broken casting rules
+    try:
+        return ufunc(*args, out=out, **kwargs)
+    except _exceptions._UFuncOutputCastingError as e:
+        # Numpy 1.17.0, 2019-02-24
+        warnings.warn(
+            "Converting the output of clip from {!r} to {!r} is deprecated. "
+            "Pass `casting=\"unsafe\"` explicitly to silence this warning, or "
+            "correct the type of the variables.".format(e.from_, e.to),
+            DeprecationWarning,
+            stacklevel=2
+        )
+        return ufunc(*args, out=out, casting="unsafe", **kwargs)
+
+def _clip(a, min=None, max=None, out=None, *, casting=None, **kwargs):
+    if min is None and max is None:
+        raise ValueError("One of max or min must be given")
+
+    # Numpy 1.17.0, 2019-02-24
+    # This deprecation probably incurs a substantial slowdown for small arrays;
+    # it will be good to get rid of it.
+    if not _clip_dep_is_byte_swapped(a) and not _clip_dep_is_byte_swapped(out):
+        using_deprecated_nan = False
+        if _clip_dep_is_scalar_nan(min):
+            min = -float('inf')
+            using_deprecated_nan = True
+        if _clip_dep_is_scalar_nan(max):
+            max = float('inf')
+            using_deprecated_nan = True
+        if using_deprecated_nan:
+            warnings.warn(
+                "Passing `np.nan` to mean no clipping in np.clip has always "
+                "been unreliable, and is now deprecated. "
+                "In future, this will always return nan, like it already does "
+                "when min or max are arrays that contain nan. "
+                "To skip a bound, pass either None or an np.inf of an "
+                "appropriate sign.",
+                DeprecationWarning,
+                stacklevel=2
+            )
+
+    if min is None:
+        return _clip_dep_invoke_with_casting(
+            um.minimum, a, max, out=out, casting=casting, **kwargs)
+    elif max is None:
+        return _clip_dep_invoke_with_casting(
+            um.maximum, a, min, out=out, casting=casting, **kwargs)
+    else:
+        return _clip_dep_invoke_with_casting(
+            um.clip, a, min, max, out=out, casting=casting, **kwargs)
+
 def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
     arr = asanyarray(a)
 
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 76360eb5a3bf..c586900695ba 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -511,6 +511,13 @@ def english_upper(s):
           TD(noobj),
           TD(O, f='npy_ObjectMin')
           ),
+'clip':
+    Ufunc(3, 1, ReorderableNone,
+          docstrings.get('numpy.core.umath.clip'),
+          'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD(noobj),
+          [TypeDescription('O', 'npy_ObjectClip', 'OOO', 'O')]
+          ),
 'fmax':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmax'),
@@ -963,6 +970,9 @@ def indent(st, spaces):
         'O': 'OO_O',
         'P': 'OO_O_method',
     },
+    (3, 1): {
+        'O': 'OOO_O',
+    }
 }
 
 #for each name
@@ -1139,6 +1149,7 @@ def make_code(funcdict, filename):
     #include "ufunc_type_resolution.h"
     #include "loops.h"
     #include "matmul.h"
+    #include "clip.h"
     %s
 
     static int
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 8b1a5a3db270..6a5def4f253a 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -42,14 +42,20 @@ def get(name):
 def add_newdoc(place, name, doc):
     doc = textwrap.dedent(doc).strip()
 
-    if name[0] != '_' and name != 'matmul':
-        # matmul is special, it does not use the OUT_SCALAR replacement strings
+    skip = (
+        # gufuncs do not use the OUT_SCALAR replacement strings
+        'matmul',
+        # clip has 3 inputs, which is not handled by this
+        'clip',
+    )
+    if name[0] != '_' and name not in skip:
         if '\nx :' in doc:
             assert '$OUT_SCALAR_1' in doc, "in {}".format(name)
         elif '\nx2 :' in doc or '\nx1, x2 :' in doc:
             assert '$OUT_SCALAR_2' in doc, "in {}".format(name)
         else:
             assert False, "Could not detect number of inputs in {}".format(name)
+
     for k, v in subst.items():
         doc = doc.replace('$' + k, v)
 
@@ -2535,6 +2541,46 @@ def add_newdoc(place, name, doc):
 
     """)
 
+add_newdoc('numpy.core.umath', 'clip',
+    """
+    Clip (limit) the values in an array.
+
+    Given an interval, values outside the interval are clipped to
+    the interval edges.  For example, if an interval of ``[0, 1]``
+    is specified, values smaller than 0 become 0, and values larger
+    than 1 become 1.
+
+    Equivalent to but faster than ``np.minimum(np.maximum(a, a_min), a_max)``.
+
+    Parameters
+    ----------
+    a : array_like
+        Array containing elements to clip.
+    a_min : array_like
+        Minimum value.
+    a_max : array_like
+        Maximum value.
+    out : ndarray, optional
+        The results will be placed in this array. It may be the input
+        array for in-place clipping.  `out` must be of the right shape
+        to hold the output.  Its type is preserved.
+    $PARAMS
+
+    See Also
+    --------
+    numpy.clip :
+        Wrapper that makes the ``a_min`` and ``a_max`` arguments optional,
+        dispatching to one of `~numpy.core.umath.clip`,
+        `~numpy.core.umath.minimum`, and `~numpy.core.umath.maximum`.
+
+    Returns
+    -------
+    clipped_array : ndarray
+        An array with the elements of `a`, but where values
+        < `a_min` are replaced with `a_min`, and those > `a_max`
+        with `a_max`.
+    """)
+
 add_newdoc('numpy.core.umath', 'matmul',
     """
     Matrix product of two arrays.
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 7024ac2376c8..58da8a54b885 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -1961,12 +1961,12 @@ def compress(condition, a, axis=None, out=None):
     return _wrapfunc(a, 'compress', condition, axis=axis, out=out)
 
 
-def _clip_dispatcher(a, a_min, a_max, out=None):
+def _clip_dispatcher(a, a_min, a_max, out=None, **kwargs):
     return (a, a_min, a_max)
 
 
 @array_function_dispatch(_clip_dispatcher)
-def clip(a, a_min, a_max, out=None):
+def clip(a, a_min, a_max, out=None, **kwargs):
     """
     Clip (limit) the values in an array.
 
@@ -1975,6 +1975,9 @@ def clip(a, a_min, a_max, out=None):
     is specified, values smaller than 0 become 0, and values larger
     than 1 become 1.
 
+    Equivalent to but faster than ``np.minimum(np.maximum(a, a_min), a_max)``.
+    No check is performed to ensure ``a_min < a_max``.
+
     Parameters
     ----------
     a : array_like
@@ -1992,6 +1995,11 @@ def clip(a, a_min, a_max, out=None):
         The results will be placed in this array. It may be the input
         array for in-place clipping.  `out` must be of the right shape
         to hold the output.  Its type is preserved.
+    **kwargs
+        For other keyword-only arguments, see the
+        :ref:`ufunc docs <ufuncs.kwargs>`.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -2020,7 +2028,7 @@ def clip(a, a_min, a_max, out=None):
     array([3, 4, 2, 3, 4, 5, 6, 7, 8, 8])
 
     """
-    return _wrapfunc(a, 'clip', a_min, a_max, out=out)
+    return _wrapfunc(a, 'clip', a_min, a_max, out=out, **kwargs)
 
 
 def _sum_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 81d140d5e255..62147d22bbdb 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -906,6 +906,8 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
+            join('src', 'umath', 'clip.h.src'),
+            join('src', 'umath', 'clip.c.src'),
             join('src', 'umath', 'ufunc_object.c'),
             join('src', 'umath', 'extobj.c'),
             join('src', 'umath', 'cpuid.c'),
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index 5f139cffb98a..bacd27473248 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -4,6 +4,9 @@
 #include <npy_config.h>
 #include "mem_overlap.h"
 
+/* For PyArray_ macros used below */
+#include "numpy/ndarrayobject.h"
+
 /*
  * NOTE: This API should remain private for the time being, to allow
  *       for further refinement.  I think the 'aligned' mechanism
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 1463d89903b3..5f7bcb8f73ee 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3808,176 +3808,6 @@ static void
 /**end repeat**/
 
 
-/*
- *****************************************************************************
- **                               FASTCLIP                                  **
- *****************************************************************************
- */
-
-#define _LESS_THAN(a, b) ((a) < (b))
-#define _GREATER_THAN(a, b) ((a) > (b))
-
-/*
- * In fastclip, 'b' was already checked for NaN, so the half comparison
- * only needs to check 'a' for NaN.
- */
-
-#define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b))
-#define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a))
-
-/**begin repeat
- *
- * #name = BOOL,
- *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *         DATETIME, TIMEDELTA#
- * #type = npy_bool,
- *         npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
- *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_half, npy_float, npy_double, npy_longdouble,
- *         npy_datetime, npy_timedelta#
- * #isfloat = 0*11, 1*4, 0*2#
- * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
- * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
- * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5#
- */
-static void
-@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
-{
-    npy_intp i;
-    @type@ max_val = 0, min_val = 0;
-
-    if (max != NULL) {
-        max_val = *max;
-#if @isfloat@
-        /* NaNs result in no clipping, so optimize the case away */
-        if (@isnan@(max_val)) {
-            if (min == NULL) {
-                memmove(out, in, ni * sizeof(@type@));
-                return;
-            }
-            max = NULL;
-        }
-#endif
-    }
-    if (min != NULL) {
-        min_val = *min;
-#if @isfloat@
-        if (@isnan@(min_val)) {
-            if (max == NULL) {
-                memmove(out, in, ni * sizeof(@type@));
-                return;
-            }
-            min = NULL;
-        }
-#endif
-    }
-    if (max == NULL) {
-        for (i = 0; i < ni; i++) {
-            if (@lt@(in[i], min_val)) {
-                out[i] = min_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-    else if (min == NULL) {
-        for (i = 0; i < ni; i++) {
-            if (@gt@(in[i], max_val)) {
-                out[i] = max_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-    else {
-        /*
-         * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
-         * manner, see: https://github.com/numpy/numpy/issues/7601
-         */
-        #if (_MSC_VER == 1900)
-        #pragma loop( no_vector )
-        #endif
-        for (i = 0; i < ni; i++) {
-            if (@lt@(in[i], min_val)) {
-                out[i]   = min_val;
-            }
-            else if (@gt@(in[i], max_val)) {
-                out[i]   = max_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-}
-/**end repeat**/
-
-#undef _LESS_THAN
-#undef _GREATER_THAN
-#undef _HALF_LESS_THAN
-#undef _HALF_GREATER_THAN
-
-/**begin repeat
- *
- * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
- * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
- */
-static void
-@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
-{
-    npy_intp i;
-    @type@ max_val, min_val;
-
-    if (max != NULL) {
-        max_val = *max;
-    }
-    if (min != NULL) {
-        min_val = *min;
-    }
-    if (max == NULL) {
-        for (i = 0; i < ni; i++) {
-            if (PyArray_CLT(in[i],min_val)) {
-                out[i] = min_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-    else if (min == NULL) {
-        for (i = 0; i < ni; i++) {
-            if (PyArray_CGT(in[i], max_val)) {
-                out[i] = max_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-    else {
-        for (i = 0; i < ni; i++) {
-            if (PyArray_CLT(in[i], min_val)) {
-                out[i] = min_val;
-            }
-            else if (PyArray_CGT(in[i], max_val)) {
-                out[i] = max_val;
-            }
-            else {
-                out[i] = in[i];
-            }
-        }
-    }
-}
-
-/**end repeat**/
-
-#define OBJECT_fastclip NULL
-
-
 /*
  *****************************************************************************
  **                              FASTPUTMASK                                **
@@ -4501,7 +4331,7 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
     (PyArray_ScalarKindFunc*)NULL,
     NULL,
     NULL,
-    (PyArray_FastClipFunc*)@from@_fastclip,
+    (PyArray_FastClipFunc*)NULL,
     (PyArray_FastPutmaskFunc*)@from@_fastputmask,
     (PyArray_FastTakeFunc*)@from@_fasttake,
     (PyArray_ArgFunc*)@from@_argmin
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index 90ee2c5b28d5..1d72a52273a7 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -918,6 +918,27 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
     }
 
     func = PyArray_DESCR(self)->f->fastclip;
+    if (func == NULL) {
+        if (min == NULL) {
+            return PyObject_CallFunctionObjArgs(n_ops.minimum, self, max, out, NULL);
+        }
+        else if (max == NULL) {
+            return PyObject_CallFunctionObjArgs(n_ops.maximum, self, min, out, NULL);
+        }
+        else {
+            return PyObject_CallFunctionObjArgs(n_ops.clip, self, min, max, out, NULL);
+        }
+    }
+
+    /* NumPy 1.17.0, 2019-02-24 */
+    if (DEPRECATE(
+            "->f->fastclip is deprecated. Use PyUFunc_RegisterLoopForDescr to "
+            "attach a custom loop to np.core.umath.clip, np.minimum, and "
+            "np.maximum") < 0) {
+        return NULL;
+    }
+    /* everything below can be removed once this deprecation completes */
+
     if (func == NULL
         || (min != NULL && !PyArray_CheckAnyScalar(min))
         || (max != NULL && !PyArray_CheckAnyScalar(max))
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 0ddec2995336..0d30db07e729 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -2399,21 +2399,7 @@ array_trace(PyArrayObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 array_clip(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    PyObject *min = NULL, *max = NULL;
-    PyArrayObject *out = NULL;
-    static char *kwlist[] = {"min", "max", "out", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO&:clip", kwlist,
-                                     &min,
-                                     &max,
-                                     PyArray_OutputConverter, &out)) {
-        return NULL;
-    }
-    if (max == NULL && min == NULL) {
-        PyErr_SetString(PyExc_ValueError, "One of max or min must be given.");
-        return NULL;
-    }
-    return PyArray_Return((PyArrayObject *)PyArray_Clip(self, min, max, out));
+    NPY_FORWARD_NDARRAY_METHOD("_clip");
 }
 
 
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 420501ce251f..0ceb994ef5d8 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -113,6 +113,7 @@ _PyArray_SetNumericOps(PyObject *dict)
     SET(rint);
     SET(conjugate);
     SET(matmul);
+    SET(clip);
     return 0;
 }
 
@@ -179,6 +180,7 @@ _PyArray_GetNumericOps(void)
     GET(rint);
     GET(conjugate);
     GET(matmul);
+    GET(clip);
     return dict;
 
  fail:
diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h
index 33a7cf872824..643241b3d02f 100644
--- a/numpy/core/src/multiarray/number.h
+++ b/numpy/core/src/multiarray/number.h
@@ -40,6 +40,7 @@ typedef struct {
     PyObject *rint;
     PyObject *conjugate;
     PyObject *matmul;
+    PyObject *clip;
 } NumericOps;
 
 extern NPY_NO_EXPORT NumericOps n_ops;
diff --git a/numpy/core/src/umath/clip.c.src b/numpy/core/src/umath/clip.c.src
new file mode 100644
index 000000000000..30fa3d2b35a3
--- /dev/null
+++ b/numpy/core/src/umath/clip.c.src
@@ -0,0 +1,119 @@
+/**
+ * This module provides the inner loops for the clip ufunc
+ */
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "numpy/halffloat.h"
+#include "numpy/npy_math.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/npy_common.h"
+#include "numpy/utils.h"
+#include "fast_loop_macros.h"
+
+/*
+ * Produce macros that perform nan/nat-propagating min and max
+ */
+
+/**begin repeat
+ * #name = BOOL,
+ *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG#
+ */
+#define _NPY_@name@_MIN(a, b) PyArray_MIN(a, b)
+#define _NPY_@name@_MAX(a, b) PyArray_MAX(a, b)
+/**end repeat**/
+
+#define _NPY_HALF_MIN(a, b) (npy_half_isnan(a) || npy_half_le(a, b) ? (a) : (b))
+#define _NPY_HALF_MAX(a, b) (npy_half_isnan(a) || npy_half_ge(a, b) ? (a) : (b))
+
+/**begin repeat
+ * #name = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+#define _NPY_@name@_MIN(a, b) (npy_isnan(a) ? (a) : PyArray_MIN(a, b))
+#define _NPY_@name@_MAX(a, b) (npy_isnan(a) ? (a) : PyArray_MAX(a, b))
+/**end repeat**/
+
+/**begin repeat
+ * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ */
+#define _NPY_@name@_MIN(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CLT(a, b) ? (a) : (b))
+#define _NPY_@name@_MAX(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CGT(a, b) ? (a) : (b))
+/**end repeat**/
+
+/**begin repeat
+ * #name = DATETIME, TIMEDELTA#
+ */
+#define _NPY_@name@_MIN(a, b) ( \
+    (a) == NPY_DATETIME_NAT ? (a) : \
+    (b) == NPY_DATETIME_NAT ? (b) : \
+    (a) < (b) ? (a) : (b) \
+)
+#define _NPY_@name@_MAX(a, b) ( \
+    (a) == NPY_DATETIME_NAT ? (a) : \
+    (b) == NPY_DATETIME_NAT ? (b) : \
+    (a) > (b) ? (a) : (b) \
+)
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE,
+ *         DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ *         npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ *         npy_half, npy_float, npy_double, npy_longdouble,
+ *         npy_cfloat, npy_cdouble, npy_clongdouble,
+ *         npy_datetime, npy_timedelta#
+ */
+
+#define _NPY_CLIP(x, min, max) \
+    _NPY_@name@_MIN(_NPY_@name@_MAX((x), (min)), (max))
+
+NPY_NO_EXPORT void
+@name@_clip(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+    if (steps[1] == 0 && steps[2] == 0) {
+        /* min and max are constant throughout the loop, the most common case */
+        /* NOTE: it may be possible to optimize these checks for nan */
+        @type@ min_val = *(@type@ *)args[1];
+        @type@ max_val = *(@type@ *)args[2];
+
+        char *ip1 = args[0], *op1 = args[3];
+        npy_intp is1 = steps[0], os1 = steps[3];
+        npy_intp n = dimensions[0];
+
+        /* contiguous, branch to let the compiler optimize */
+        if (is1 == sizeof(@type@) && os1 == sizeof(@type@)) {
+            for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
+                *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
+            }
+        }
+        else {
+            for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
+                *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
+            }
+        }
+    }
+    else {
+        TERNARY_LOOP {
+            *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, *(@type@ *)ip2, *(@type@ *)ip3);
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+/* clean up the macros we defined above */
+#undef _NPY_CLIP
+#undef _NPY_@name@_MAX
+#undef _NPY_@name@_MIN
+
+/**end repeat**/
diff --git a/numpy/core/src/umath/clip.h.src b/numpy/core/src/umath/clip.h.src
new file mode 100644
index 000000000000..d77971ad7946
--- /dev/null
+++ b/numpy/core/src/umath/clip.h.src
@@ -0,0 +1,18 @@
+#ifndef _NPY_UMATH_CLIP_H_
+#define _NPY_UMATH_CLIP_H_
+
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE,
+ *         DATETIME, TIMEDELTA#
+ */
+NPY_NO_EXPORT void
+@name@_clip(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+#endif
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 7a1ed66bc94f..ae6d69a3e7cc 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -58,6 +58,14 @@
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
 
+/** (ip1, ip2, ip3) -> (op1) */
+#define TERNARY_LOOP\
+    char *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];\
+    npy_intp is1 = steps[0], is2 = steps[1], is3 = steps[2], os1 = steps[3];\
+    npy_intp n = dimensions[0];\
+    npy_intp i;\
+    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
+
 /** @} */
 
 /* unary loop input and output contiguous */
diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src
index 2acae3c37c88..c2732f92523f 100644
--- a/numpy/core/src/umath/funcs.inc.src
+++ b/numpy/core/src/umath/funcs.inc.src
@@ -259,6 +259,17 @@ npy_ObjectLCM(PyObject *i1, PyObject *i2)
     return PyNumber_Absolute(tmp);
 }
 
+
+static PyObject *
+npy_ObjectClip(PyObject *arr, PyObject *min, PyObject *max) {
+    PyObject *o = npy_ObjectMax(arr, min);
+    if (o == NULL) {
+        return NULL;
+    }
+    Py_SETREF(o, npy_ObjectMin(o, max));
+    return o;
+}
+
 /*
  *****************************************************************************
  **                           COMPLEX FUNCTIONS                             **
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ff3b36428df1..89eeb0c47302 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -425,6 +425,28 @@ PyUFunc_OO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
     }
 }
 
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+    ternaryfunc f = (ternaryfunc)func;
+    TERNARY_LOOP {
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        PyObject *in3 = *(PyObject **)ip3;
+        PyObject **out = (PyObject **)op1;
+        PyObject *ret = f(
+            in1 ? in1 : Py_None,
+            in2 ? in2 : Py_None,
+            in3 ? in3 : Py_None
+        );
+        if (ret == NULL) {
+            return;
+        }
+        Py_XDECREF(*out);
+        *out = ret;
+    }
+}
+
 /*UFUNC_API*/
 NPY_NO_EXPORT void
 PyUFunc_OO_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 8dd3170e3c0c..7f05a693a0c8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -549,6 +549,9 @@ OBJECT@suffix@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *
 NPY_NO_EXPORT void
 OBJECT_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func);
+
 /*
  *****************************************************************************
  **                              END LOOPS                                  **
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index b94a5a0f7a1f..72493e3084f3 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -131,7 +131,7 @@ abs_ptrdiff(char *a, char *b)
  * #func = exp, log#
  */
 
-static void
+static NPY_INLINE void
 @ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
 
 /**end repeat1**/
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index d22f160992ba..59c19aa1b71a 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4287,7 +4287,11 @@ def _clip_type(self, type_group, array_max,
 
                 x = (np.random.random(1000) * array_max).astype(dtype)
                 if inplace:
-                    x.clip(clip_min, clip_max, x)
+                    # The tests that call us pass clip_min and clip_max that
+                    # might not fit in the destination dtype. They were written
+                    # assuming the previous unsafe casting, which now must be
+                    # passed explicitly to avoid a warning.
+                    x.clip(clip_min, clip_max, x, casting='unsafe')
                 else:
                     x = x.clip(clip_min, clip_max)
                     byteorder = '='
@@ -4306,7 +4310,7 @@ def test_basic(self):
                 'float', 1024, 0, 0, inplace=inplace)
 
             self._clip_type(
-                'int', 1024, -120, 100.5, inplace=inplace)
+                'int', 1024, -120, 100, inplace=inplace)
             self._clip_type(
                 'int', 1024, 0, 0, inplace=inplace)
 
@@ -7793,13 +7797,6 @@ def test_argmin_with_out(self):
         res = np.argmin(mat, 0, out=out)
         assert_equal(res, range(5))
 
-    def test_clip_with_out(self):
-        mat = np.eye(5)
-        out = np.eye(5, dtype='i2')
-        res = np.clip(mat, a_min=-10, a_max=0, out=out)
-        assert_(res is out)
-        assert_equal(np.sum(out), 0)
-
     def test_insert_noncontiguous(self):
         a = np.arange(6).reshape(2,3).T # force non-c-contiguous
         # uses arr_insert
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 406110fa7588..ae596ec20bd3 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -13,7 +13,7 @@
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
-    HAS_REFCOUNT
+    assert_warns, HAS_REFCOUNT
     )
 
 
@@ -1336,11 +1336,17 @@ def setup(self):
         self.nr = 5
         self.nc = 3
 
-    def fastclip(self, a, m, M, out=None):
+    def fastclip(self, a, m, M, out=None, casting=None):
         if out is None:
-            return a.clip(m, M)
+            if casting is None:
+                return a.clip(m, M)
+            else:
+                return a.clip(m, M, casting=casting)
         else:
-            return a.clip(m, M, out)
+            if casting is None:
+                return a.clip(m, M, out)
+            else:
+                return a.clip(m, M, out, casting=casting)
 
     def clip(self, a, m, M, out=None):
         # use slow-clip
@@ -1378,6 +1384,20 @@ def _generate_int32_data(self, n, m):
         return (10 * rand(n, m)).astype(np.int32)
 
     # Now the real test cases
+
+    @pytest.mark.parametrize("dtype", '?bhilqpBHILQPefdgFDGO')
+    def test_ones_pathological(self, dtype):
+        # Pins the behavior described in gh-12519: clipping ones with
+        # min=1 > max=0 is expected to give zeros for every dtype code.
+        # The amin > amax result may still change in the future.
+        ones = np.ones(10, dtype=dtype)
+        zeros = np.zeros(10, dtype=dtype)
+        clipped = np.clip(ones, 1, 0)
+        if dtype != 'O':
+            assert_equal(clipped, zeros)
+        else:
+            assert clipped.tolist() == zeros.tolist()
+
     def test_simple_double(self):
         # Test native double input with scalar min/max.
         a = self._generate_data(self.nr, self.nc)
@@ -1476,14 +1496,21 @@ def test_simple_out(self):
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
-    def test_simple_int32_inout(self):
+    @pytest.mark.parametrize("casting", [None, "unsafe"])
+    def test_simple_int32_inout(self, casting):
         # Test native int32 input with double min/max and int32 out.
         a = self._generate_int32_data(self.nr, self.nc)
         m = np.float64(0)
         M = np.float64(2)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        self.fastclip(a, m, M, ac)
+        if casting is None:
+            with assert_warns(DeprecationWarning):
+                # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+                self.fastclip(a, m, M, ac, casting=casting)
+        else:
+            # explicitly passing "unsafe" will silence warning
+            self.fastclip(a, m, M, ac, casting=casting)
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -1505,7 +1532,9 @@ def test_simple_int64_inout(self):
         M = np.float64(1)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        self.fastclip(a, m, M, ac)
+        with assert_warns(DeprecationWarning):
+            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+            self.fastclip(a, m, M, ac)
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -1516,7 +1545,9 @@ def test_simple_int32_out(self):
         M = 2.0
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        self.fastclip(a, m, M, ac)
+        with assert_warns(DeprecationWarning):
+            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+            self.fastclip(a, m, M, ac)
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -1692,7 +1723,9 @@ def test_clip_with_out_simple2(self):
         M = np.float64(2)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        self.fastclip(a, m, M, ac)
+        with assert_warns(DeprecationWarning):
+            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+            self.fastclip(a, m, M, ac)
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -1714,7 +1747,9 @@ def test_clip_with_out_array_int32(self):
         M = np.float64(1)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        self.fastclip(a, m, M, ac)
+        with assert_warns(DeprecationWarning):
+            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+            self.fastclip(a, m, M, ac)
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -1725,7 +1760,9 @@ def test_clip_with_out_array_outint32(self):
         M = 2.0
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        self.fastclip(a, m, M, ac)
+        with assert_warns(DeprecationWarning):
+            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+            self.fastclip(a, m, M, ac)
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -1778,11 +1815,94 @@ def test_clip_func_takes_out(self):
 
     def test_clip_nan(self):
         d = np.arange(7.)
-        assert_equal(d.clip(min=np.nan), d)
-        assert_equal(d.clip(max=np.nan), d)
-        assert_equal(d.clip(min=np.nan, max=np.nan), d)
-        assert_equal(d.clip(min=-2, max=np.nan), d)
-        assert_equal(d.clip(min=np.nan, max=10), d)
+        with assert_warns(DeprecationWarning):
+            assert_equal(d.clip(min=np.nan), d)
+        with assert_warns(DeprecationWarning):
+            assert_equal(d.clip(max=np.nan), d)
+        with assert_warns(DeprecationWarning):
+            assert_equal(d.clip(min=np.nan, max=np.nan), d)
+        with assert_warns(DeprecationWarning):
+            assert_equal(d.clip(min=-2, max=np.nan), d)
+        with assert_warns(DeprecationWarning):
+            assert_equal(d.clip(min=np.nan, max=10), d)
+
+    def test_object_clip(self):
+        # clip() must also handle object-dtype input with scalar bounds
+        values = np.arange(10, dtype=object)
+        clipped = np.clip(values, 1, 5)
+        assert clipped.tolist() == [1, 1, 2, 3, 4, 5, 5, 5, 5, 5]
+
+    def test_clip_all_none(self):
+        # passing None for both bounds must raise ValueError
+        with assert_raises_regex(ValueError, 'max or min'):
+            np.clip(np.arange(10, dtype=object), None, None)
+
+    def test_clip_invalid_casting(self):
+        # an unrecognized casting mode must be rejected with ValueError
+        arr = np.arange(10, dtype=object)
+        with assert_raises_regex(ValueError, 'casting must be one of'):
+            self.fastclip(arr, 1, 8, casting="garbage")
+
+    @pytest.mark.parametrize("amin, amax", [
+        # both bounds scalar
+        (1, 0),
+        # scalar lower bound, array upper bound
+        (1, np.zeros(10)),
+        # both bounds arrays
+        (np.ones(10), np.zeros(10)),
+        ])
+    def test_clip_value_min_max_flip(self, amin, amax):
+        # with amin > amax, clip() must still match the reference
+        # composition required by ufunc_docstrings.py
+        data = np.arange(10, dtype=np.int64)
+        reference = np.minimum(np.maximum(data, amin), amax)
+        assert_equal(np.clip(data, amin, amax), reference)
+
+    @pytest.mark.parametrize("arr, amin, amax, exp", [
+        # regression case for a bug in npy_ObjectClip, derived
+        # from an example produced by hypothesis
+        (np.zeros(10, dtype=np.int64),
+         0,
+         -2**64+1,
+         np.full(10, -2**64+1, dtype=object)),
+        # regression case for bugs in NPY_TIMEDELTA_MAX, derived
+        # from an example produced by hypothesis
+        (np.zeros(10, dtype='m8') - 1,
+         0,
+         0,
+         np.zeros(10, dtype='m8')),
+    ])
+    def test_clip_problem_cases(self, arr, amin, amax, exp):
+        # each case pairs clip() inputs with the exact expected output
+        assert_equal(np.clip(arr, amin, amax), exp)
+
+    @pytest.mark.xfail(reason="no scalar nan propagation yet")
+    @pytest.mark.parametrize("arr, amin, amax", [
+        # problematic scalar nan case from hypothesis
+        (np.zeros(10, dtype=np.int64),
+         np.array(np.nan),
+         np.zeros(10, dtype=np.int32)),
+    ])
+    def test_clip_scalar_nan_propagation(self, arr, amin, amax):
+        # clip() should propagate a scalar nan bound exactly like the
+        # minimum(maximum(arr, amin), amax) reference built from `arr`
+        expected = np.minimum(np.maximum(arr, amin), amax)
+        with assert_warns(DeprecationWarning):
+            actual = np.clip(arr, amin, amax)
+            assert_equal(actual, expected)
+
+    @pytest.mark.xfail(reason="propagation doesn't match spec")
+    @pytest.mark.parametrize("arr, amin, amax", [
+        (np.array([1] * 10, dtype='m8'),
+         np.timedelta64('NaT'),
+         np.zeros(10, dtype=np.int32)),
+    ])
+    def test_NaT_propagation(self, arr, amin, amax):
+        # NOTE: the expected function spec (built from `arr` below)
+        # doesn't propagate NaT, but clip() now does
+        expected = np.minimum(np.maximum(arr, amin), amax)
+        actual = np.clip(arr, amin, amax)
+        assert_equal(actual, expected)
 
 
 class TestAllclose(object):