Handling blend_h for w=32 a single row at a time doesn't help much:

                               A73               A53
                          Current  Earlier  Current  Earlier
blend_h_w32_8bpc_neon:      169.3    144.3    298.2    262.2
blend_h_w64_8bpc_neon:      262.8    247.1    521.6    469.3
blend_h_w128_8bpc_neon:     571.6    544.1   1167.7   1055.1
......@@ -418,51 +418,32 @@ function blend_h_8bpc_neon, export=1
lsr x1, x1, #1
sub x1, x1, w3, uxtw
add x7, x2, w3, uxtw
ld2r {v0.16b, v1.16b}, [x5], #2
ld1r {v0.16b}, [x5], #1
mov w6, w3
sub v20.16b, v4.16b, v0.16b
sub v21.16b, v4.16b, v1.16b
ld1 {v16.16b, v17.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x0]
subs w6, w6, #32
umull v23.8h, v0.8b, v16.8b
umlal v23.8h, v2.8b, v20.8b
ld1 {v18.16b, v19.16b}, [x7], #32
umull2 v27.8h, v0.16b, v16.16b
umlal2 v27.8h, v2.16b, v20.16b
ld1 {v6.16b, v7.16b}, [x8]
umull v24.8h, v0.8b, v17.8b
umlal v24.8h, v3.8b, v20.8b
umull2 v28.8h, v0.16b, v17.16b
umlal2 v28.8h, v3.16b, v20.16b
umull v25.8h, v1.8b, v18.8b
umlal v25.8h, v6.8b, v21.8b
umull2 v5.8h, v1.16b, v18.16b
umlal2 v5.8h, v6.16b, v21.16b
rshrn v29.8b, v23.8h, #6
rshrn2 v29.16b, v27.8h, #6
umull v26.8h, v1.8b, v19.8b
umlal v26.8h, v7.8b, v21.8b
umull2 v31.8h, v1.16b, v19.16b
umlal2 v31.8h, v7.16b, v21.16b
rshrn v30.8b, v24.8h, #6
rshrn2 v30.16b, v28.8h, #6
rshrn v23.8b, v25.8h, #6
rshrn2 v23.16b, v5.8h, #6
rshrn v24.8b, v26.8h, #6
st1 {v29.16b, v30.16b}, [x0], #32
rshrn2 v24.16b, v31.8h, #6
st1 {v23.16b, v24.16b}, [x8], #32 32b
subs w4, w4, #2
subs w4, w4, #1
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw
add x7, x7, w3, uxtw 321b
