Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Z
zstd
Manage
Activity
Members
Plan
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Terraform modules
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
CodeLinaro
yocto-mirrors
zstd
Commits
f6d673a5
Commit
f6d673a5
authored
3 years ago
by
Yann Collet
Browse files
Options
Downloads
Patches
Plain Diff
Attempt an SSE2/AVX2 branch of the lazy match detector
parent
41153071
Branches
lazy_avx2
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
lib/common/compiler.h
+8
-1
8 additions, 1 deletion
lib/common/compiler.h
lib/compress/zstd_lazy.c
+26
-1
26 additions, 1 deletion
lib/compress/zstd_lazy.c
with
34 additions
and
2 deletions
lib/common/compiler.h
+
8
−
1
View file @
f6d673a5
...
@@ -190,6 +190,9 @@
...
@@ -190,6 +190,9 @@
/* compile time determination of SIMD support */
/* compile time determination of SIMD support */
#if !defined(ZSTD_NO_INTRINSICS)
#if !defined(ZSTD_NO_INTRINSICS)
# if defined(__AVX2__)
# define ZSTD_ARCH_X86_AVX2
# endif
# if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
# if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
# define ZSTD_ARCH_X86_SSE2
# define ZSTD_ARCH_X86_SSE2
# endif
# endif
...
@@ -197,9 +200,13 @@
...
@@ -197,9 +200,13 @@
# define ZSTD_ARCH_ARM_NEON
# define ZSTD_ARCH_ARM_NEON
# endif
# endif
#
#
# if defined(ZSTD_ARCH_X86_AVX2)
# include <immintrin.h>
# endif
# if defined(ZSTD_ARCH_X86_SSE2)
# if defined(ZSTD_ARCH_X86_SSE2)
# include <emmintrin.h>
# include <emmintrin.h>
# elif defined(ZSTD_ARCH_ARM_NEON)
# endif
# if defined(ZSTD_ARCH_ARM_NEON)
# include <arm_neon.h>
# include <arm_neon.h>
# endif
# endif
#endif
#endif
...
...
This diff is collapsed.
Click to expand it.
lib/compress/zstd_lazy.c
+
26
−
1
View file @
f6d673a5
...
@@ -1002,6 +1002,27 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
...
@@ -1002,6 +1002,27 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
}
}
#endif
#endif
#if defined(ZSTD_ARCH_X86_AVX2)
/* ZSTD_row_getAVXMask():
 * AVX2 variant of the row tag comparison.
 * Compares every byte of `src` against `tag`, 32 bytes per load, and builds a
 * bitfield in which bit n is set when src[n] == tag. The bitfield is then
 * rotated right by `head` before being returned.
 *
 * @nbChunks : number of 16-byte chunks to scan; must be 1, 2 or 4.
 *             A single chunk is too small for a 256-bit load, so that case is
 *             delegated to ZSTD_row_getSSEMask().
 * @src      : start of the bytes to compare (unaligned loads are used).
 * @tag      : byte value searched for.
 * @head     : rotation amount applied to the resulting bitfield.
 * @return   : rotated match bitfield (32 significant bits when nbChunks==2,
 *             64 when nbChunks==4).
 */
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getAVXMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
{
    if (nbChunks == 1) return ZSTD_row_getSSEMask(1, src, tag, head);

    {   const __m256i comparisonMask = _mm256_set1_epi8((char)tag);
        /* Stored as U32 on purpose : _mm256_movemask_epi8() returns a signed int,
         * and widening a negative int (bit 31 set) to U64 would sign-extend,
         * polluting the upper 32 bits of the combined 64-bit mask. */
        U32 matches[2] = {0};
        int i;
        assert(nbChunks == 2 || nbChunks == 4);
        for (i = 0; i < (nbChunks / 2); i++) {
            const __m256i chunk = _mm256_loadu_si256((const __m256i*)(const void*)(src + 32*i));
            const __m256i equalMask = _mm256_cmpeq_epi8(chunk, comparisonMask);
            matches[i] = (U32)_mm256_movemask_epi8(equalMask);
        }
        if (nbChunks == 2) return ZSTD_rotateRight_U32(matches[0], head);
        assert(nbChunks == 4);
        /* low 32 entries in the low half, high 32 entries in the high half */
        return ZSTD_rotateRight_U64(((U64)matches[1] << 32) | (U64)matches[0], head);
    }
}
#endif
/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
* the hash at the nth position in a row of the tagTable.
* the hash at the nth position in a row of the tagTable.
* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
...
@@ -1013,7 +1034,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
...
@@ -1013,7 +1034,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
assert
((
rowEntries
==
16
)
||
(
rowEntries
==
32
)
||
rowEntries
==
64
);
assert
((
rowEntries
==
16
)
||
(
rowEntries
==
32
)
||
rowEntries
==
64
);
assert
(
rowEntries
<=
ZSTD_ROW_HASH_MAX_ENTRIES
);
assert
(
rowEntries
<=
ZSTD_ROW_HASH_MAX_ENTRIES
);
#if defined(ZSTD_ARCH_X86_SSE2)
#if defined(ZSTD_ARCH_X86_AVX2)
return
ZSTD_row_getAVXMask
(
rowEntries
/
16
,
src
,
tag
,
head
);
#elif defined(ZSTD_ARCH_X86_SSE2)
return
ZSTD_row_getSSEMask
(
rowEntries
/
16
,
src
,
tag
,
head
);
return
ZSTD_row_getSSEMask
(
rowEntries
/
16
,
src
,
tag
,
head
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment