md5-ia64.S@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago
Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified. bugref:8070: src/libs maintenance
Property svn:eol-style set to `native`
File size: 21.4 KB

Line
1	/*
2	*
3	* Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4	*
5	* Licensed under the OpenSSL license (the "License"). You may not use
6	* this file except in compliance with the License. You can obtain a copy
7	* in the file LICENSE in the source distribution or at
8	* https://www.openssl.org/source/license.html
9	*/
10
11	/* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
12
13	Permission is hereby granted, free of charge, to any person obtaining
14	a copy of this software and associated documentation files (the
15	"Software"), to deal in the Software without restriction, including
16	without limitation the rights to use, copy, modify, merge, publish,
17	distribute, sublicense, and/or sell copies of the Software, and to
18	permit persons to whom the Software is furnished to do so, subject to
19	the following conditions:
20
21	The above copyright notice and this permission notice shall be
22	included in all copies or substantial portions of the Software.
23
24	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
28	LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
29	OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
30	WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
31
32	// Common registers are assigned as follows:
33	//
34	// COMMON
35	//
36	// t0 Const Tbl Ptr TPtr
37	// t1 Round Constant TRound
38	// t4 Block residual LenResid
39	// t5 Residual Data DTmp
40	//
41	// {in,out}0 Block 0 Cycle RotateM0
42	// {in,out}1 Block Value 12 M12
43	// {in,out}2 Block Value 8 M8
44	// {in,out}3 Block Value 4 M4
45	// {in,out}4 Block Value 0 M0
46	// {in,out}5 Block 1 Cycle RotateM1
47	// {in,out}6 Block Value 13 M13
48	// {in,out}7 Block Value 9 M9
49	// {in,out}8 Block Value 5 M5
50	// {in,out}9 Block Value 1 M1
51	// {in,out}10 Block 2 Cycle RotateM2
52	// {in,out}11 Block Value 14 M14
53	// {in,out}12 Block Value 10 M10
54	// {in,out}13 Block Value 6 M6
55	// {in,out}14 Block Value 2 M2
56	// {in,out}15 Block 3 Cycle RotateM3
57	// {in,out}16 Block Value 15 M15
58	// {in,out}17 Block Value 11 M11
59	// {in,out}18 Block Value 7 M7
60	// {in,out}19 Block Value 3 M3
61	// {in,out}20 Scratch Z
62	// {in,out}21 Scratch Y
63	// {in,out}22 Scratch X
64	// {in,out}23 Scratch W
65	// {in,out}24 Digest A A
66	// {in,out}25 Digest B B
67	// {in,out}26 Digest C C
68	// {in,out}27 Digest D D
69	// {in,out}28 Active Data Ptr DPtr
70	// in28 Dummy Value -
71	// out28 Dummy Value -
72	// bt0 Coroutine Link QUICK_RTN
73	//
74	/// These predicates are used for computing the padding block(s) and
75	/// are shared between the driver and digest co-routines
76	//
77	// pt0 Extra Pad Block pExtra
78	// pt1 Load next word pLoad
79	// pt2 Skip next word pSkip
80	// pt3 Search for Pad pNoPad
81	// pt4 Pad Word 0 pPad0
82	// pt5 Pad Word 1 pPad1
83	// pt6 Pad Word 2 pPad2
84	// pt7 Pad Word 3 pPad3
85
86	#define DTmp r19
87	#define LenResid r18
88	#define QUICK_RTN b6
89	#define TPtr r14
90	#define TRound r15
91	#define pExtra p6
92	#define pLoad p7
93	#define pNoPad p9
94	#define pPad0 p10
95	#define pPad1 p11
96	#define pPad2 p12
97	#define pPad3 p13
98	#define pSkip p8
99
100	#define A_ out24
101	#define B_ out25
102	#define C_ out26
103	#define D_ out27
104	#define DPtr_ out28
105	#define M0_ out4
106	#define M1_ out9
107	#define M10_ out12
108	#define M11_ out17
109	#define M12_ out1
110	#define M13_ out6
111	#define M14_ out11
112	#define M15_ out16
113	#define M2_ out14
114	#define M3_ out19
115	#define M4_ out3
116	#define M5_ out8
117	#define M6_ out13
118	#define M7_ out18
119	#define M8_ out2
120	#define M9_ out7
121	#define RotateM0_ out0
122	#define RotateM1_ out5
123	#define RotateM2_ out10
124	#define RotateM3_ out15
125	#define W_ out23
126	#define X_ out22
127	#define Y_ out21
128	#define Z_ out20
129
130	#define A in24
131	#define B in25
132	#define C in26
133	#define D in27
134	#define DPtr in28
135	#define M0 in4
136	#define M1 in9
137	#define M10 in12
138	#define M11 in17
139	#define M12 in1
140	#define M13 in6
141	#define M14 in11
142	#define M15 in16
143	#define M2 in14
144	#define M3 in19
145	#define M4 in3
146	#define M5 in8
147	#define M6 in13
148	#define M7 in18
149	#define M8 in2
150	#define M9 in7
151	#define RotateM0 in0
152	#define RotateM1 in5
153	#define RotateM2 in10
154	#define RotateM3 in15
155	#define W in23
156	#define X in22
157	#define Y in21
158	#define Z in20
159
160	/* register stack configuration for md5_block_asm_data_order(): */
161	#define MD5_NINP 3 /* in0..in2: context pointer, data pointer, block count */
162	#define MD5_NLOC 0
163	#define MD5_NOUT 29 /* out0..out28; these become the helpers' inputs (_NINPUTS) */
164	#define MD5_NROT 0
165
166	/* register stack configuration for helpers: */
167	#define _NINPUTS MD5_NOUT
168	#define _NLOCALS 0
169	#define _NOUTPUT 0
170	#define _NROTATE 24 /* this must be <= _NINPUTS; covers in0..in23 (message words and scratch), leaving in24..in28 (A, B, C, D, DPtr) static */
171
172	#if defined(_HPUX_SOURCE) && !defined(_LP64)
173	#define ADDP addp4
174	#else
175	#define ADDP add
176	#endif
177
178	#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
179	#define HOST_IS_BIG_ENDIAN /* the driver clears psr.be on entry and sets it again at exit so ld4/st4 run little-endian */
180	#endif
181
182	// Macros for getting the left and right portions of little-endian words
183	// when the input is misaligned by 'align' bytes (align = 1..3):
184	#define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align /* dst = low 8*align bits of src moved up to bit 32-8*align, rest zero */
185	#define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align /* dst = bits [8*align, 32) of src moved down to bit 0, zero-extended */
186
187	// MD5 driver
188	//
189	// Reads an input block, then calls the digest block
190	// subroutine and adds the results to the accumulated
191	// digest. It allocates 29 outs (MD5_NOUT) which the subroutine
192	// uses as its inputs and rotating
193	// registers. Initializes the round constant pointer and
194	// takes care of saving/restoring ar.lc
195	//
196	/// INPUT
197	//
198	// in0 Context Ptr CtxPtr0
199	// in1 Input Data Ptr DPtrIn
200	// in2 Integral Blocks BlockCount
201	// rp Return Address -
202	//
203	/// CODE
204	//
205	// v2 Input Align InAlign
206	// t0 Shared w/digest -
207	// t1 Shared w/digest -
208	// t2 Shared w/digest -
209	// t3 Shared w/digest -
210	// t4 Shared w/digest -
211	// t5 Shared w/digest -
212	// t6 PFS Save PFSSave
213	// t7 ar.lc Save LCSave
214	// t8 Saved PR PRSave
215	// t9 2nd CtxPtr CtxPtr1
216	// t10 Table Base CTable
217	// t11 Table[0] CTable0
218	// t13 Accumulator A AccumA
219	// t14 Accumulator B AccumB
220	// t15 Accumulator C AccumC
221	// t16 Accumulator D AccumD
222	// pt0 Shared w/digest -
223	// pt1 Shared w/digest -
224	// pt2 Shared w/digest -
225	// pt3 Shared w/digest -
226	// pt4 Shared w/digest -
227	// pt5 Shared w/digest -
228	// pt6 Shared w/digest -
229	// pt7 Shared w/digest -
230	// pt8 Not Aligned pOff
231	// pt8 Blocks Left pAgain
232
233	#define AccumA r27
234	#define AccumB r28
235	#define AccumC r29
236	#define AccumD r30
237	#define CTable r24
238	#define CTable0 r25
239	#define CtxPtr0 in0
240	#define CtxPtr1 r23
241	#define DPtrIn in1
242	#define BlockCount in2
243	#define InAlign r10
244	#define LCSave r21
245	#define PFSSave r20
246	#define PRSave r22
247	#define pAgain p63
248	#define pOff p63
249
250	.text
251
252	/* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
253
254	where:
255	c: a pointer to a structure of this type:
256
257	typedef struct MD5state_st
258	{
259	MD5_LONG A,B,C,D;
260	MD5_LONG Nl,Nh;
261	MD5_LONG data[MD5_LBLOCK];
262	unsigned int num;
263	}
264	MD5_CTX;
265
266	data: a pointer to the input data (may be misaligned)
267	num: the number of 64-byte blocks to hash (i.e., the length
268	of DATA is 64*NUM bytes).
269
270	*/
271
272	.type md5_block_asm_data_order, @function
273	.global md5_block_asm_data_order
274	.align 32
275	.proc md5_block_asm_data_order
276	md5_block_asm_data_order:
277	.md5_block:
278	.prologue
279	{ .mmi
280	.save ar.pfs, PFSSave
281	alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
282	ADDP CtxPtr1 = 8, CtxPtr0 // CtxPtr1 = context + 8 (the C word)
283	mov CTable = ip // address of this first bundle, i.e. .md5_block
284	}
285	{ .mmi
286	ADDP DPtrIn = 0, DPtrIn // adds zero; when ADDP is addp4 this also widens a 32-bit pointer
287	ADDP CtxPtr0 = 0, CtxPtr0
288	.save ar.lc, LCSave
289	mov LCSave = ar.lc
290	}
291	;;
292	{ .mmi
293	add CTable = .md5_tbl_data_order#-.md5_block#, CTable // CTable -> start of the round-constant table
294	and InAlign = 0x3, DPtrIn // byte misalignment of the input pointer (0..3)
295	}
296
297	{ .mmi
298	ld4 AccumA = [CtxPtr0], 4 // A; CtxPtr0 now points at B
299	ld4 AccumC = [CtxPtr1], 4 // C; CtxPtr1 now points at D
300	.save pr, PRSave
301	mov PRSave = pr
302	.body
303	}
304	;;
305	{ .mmi
306	ld4 AccumB = [CtxPtr0]
307	ld4 AccumD = [CtxPtr1]
308	dep DPtr_ = 0, DPtrIn, 0, 2 // DPtr_ = DPtrIn with the low two bits cleared
309	} ;;
310	#ifdef HOST_IS_BIG_ENDIAN
311	rum psr.be;; // switch to little-endian
312	#endif
313	{ .mmb
314	ld4 CTable0 = [CTable], 4 // first round constant; CTable now points at the second
315	cmp.ne pOff, p0 = 0, InAlign
316	(pOff) br.cond.spnt.many .md5_unaligned
317	} ;;
318
319	// The FF load/compute loop rotates values three times, so that
320	// loading into M12 here produces the M0 value, M13 -> M1, etc.
321
322	.md5_block_loop0:
323	{ .mmi
324	ld4 M12_ = [DPtr_], 4
325	mov TPtr = CTable // restart the constant pointer for this block
326	mov TRound = CTable0
327	} ;;
328	{ .mmi
329	ld4 M13_ = [DPtr_], 4
330	mov A_ = AccumA
331	mov B_ = AccumB
332	} ;;
333	{ .mmi
334	ld4 M14_ = [DPtr_], 4
335	mov C_ = AccumC
336	mov D_ = AccumD
337	} ;;
338	{ .mmb
339	ld4 M15_ = [DPtr_], 4
340	add BlockCount = -1, BlockCount
341	br.call.sptk.many QUICK_RTN = md5_digest_block0
342	} ;;
343
344	// Now, we add the new digest values and do some clean-up
345	// before checking if there's another full block to process
346
347	{ .mmi
348	add AccumA = AccumA, A_
349	add AccumB = AccumB, B_
350	cmp.ne pAgain, p0 = 0, BlockCount
351	}
352	{ .mib
353	add AccumC = AccumC, C_
354	add AccumD = AccumD, D_
355	(pAgain) br.cond.dptk.many .md5_block_loop0
356	} ;;
357
358	.md5_exit:
359	#ifdef HOST_IS_BIG_ENDIAN
360	sum psr.be;; // switch back to big-endian mode
361	#endif
362	{ .mmi
363	st4 [CtxPtr0] = AccumB, -4 // store B, then step CtxPtr0 back to A
364	st4 [CtxPtr1] = AccumD, -4 // store D, then step CtxPtr1 back to C
365	mov pr = PRSave, 0x1ffff ;;
366	}
367	{ .mmi
368	st4 [CtxPtr0] = AccumA
369	st4 [CtxPtr1] = AccumC
370	mov ar.lc = LCSave
371	} ;;
372	{ .mib
373	mov ar.pfs = PFSSave
374	br.ret.sptk.few rp
375	} ;;
376
377	#define MD5UNALIGNED(offset) \
378	.md5_process##offset: \
379	{ .mib ; \
380	nop 0x0 ; \
381	GETRW(DTmp, DTmp, offset) ; \
382	} ;; \
383	.md5_block_loop##offset: \
384	{ .mmi ; \
385	ld4 Y_ = [DPtr_], 4 ; \
386	mov TPtr = CTable ; \
387	mov TRound = CTable0 ; \
388	} ;; \
389	{ .mmi ; \
390	ld4 M13_ = [DPtr_], 4 ; \
391	mov A_ = AccumA ; \
392	mov B_ = AccumB ; \
393	} ;; \
394	{ .mii ; \
395	ld4 M14_ = [DPtr_], 4 ; \
396	GETLW(W_, Y_, offset) ; \
397	mov C_ = AccumC ; \
398	} \
399	{ .mmi ; \
400	mov D_ = AccumD ;; \
401	or M12_ = W_, DTmp ; \
402	GETRW(DTmp, Y_, offset) ; \
403	} \
404	{ .mib ; \
405	ld4 M15_ = [DPtr_], 4 ; \
406	add BlockCount = -1, BlockCount ; \
407	br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
408	} ;; \
409	{ .mmi ; \
410	add AccumA = AccumA, A_ ; \
411	add AccumB = AccumB, B_ ; \
412	cmp.ne pAgain, p0 = 0, BlockCount ; \
413	} \
414	{ .mib ; \
415	add AccumC = AccumC, C_ ; \
416	add AccumD = AccumD, D_ ; \
417	(pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
418	} ;; \
419	{ .mib ; \
420	nop 0x0 ; \
421	nop 0x0 ; \
422	br.cond.sptk.many .md5_exit ; \
423	} ;;
424
425	.align 32
426	.md5_unaligned:
427	//
428	// Because variable shifts are expensive, we special case each of
429	// the four alignments. In practice, this won't hurt too much
430	// since only one working set of code will be loaded.
431	//
432	{ .mib
433	ld4 DTmp = [DPtr_], 4 // aligned word that contains the first input bytes
434	cmp.eq pOff, p0 = 1, InAlign
435	(pOff) br.cond.dpnt.many .md5_process1
436	} ;;
437	{ .mib
438	cmp.eq pOff, p0 = 2, InAlign
439	nop 0x0
440	(pOff) br.cond.dpnt.many .md5_process2
441	} ;;
442	MD5UNALIGNED(3)
443	MD5UNALIGNED(1)
444	MD5UNALIGNED(2)
445
446	.endp md5_block_asm_data_order
447
448
449	// MD5 Perform the F function and load
450	//
451	// Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
452	// computes the FF() round of functions, then branches to the common
453	// digest code to finish up with GG(), HH, and II().
454	//
455	// INPUT
456	//
457	// rp Return Address -
458	//
459	// CODE
460	//
461	// v0 PFS bit bucket PFS
462	// v1 Loop Trip Count LTrip
463	// pt0 Load next word pMore
464
465	/* For F round: */
466	#define LTrip r9 /* counts down; pMore is recomputed from it at the end of each FF loop trip */
467	#define PFS r8 /* receives the old ar.pfs from alloc in the helpers */
468	#define pMore p6 /* same predicate register as pExtra */
469
470	/* For GHI rounds: */
471	#define T r9 /* same register as LTrip, which is only used in the F round */
472	#define U r10 /* same register as InAlign; the driver reads InAlign only before its first call */
473	#define V r11
474	// COMPUTE(a, b, s, M, R): load the next round constant, rotate the low 32 bits of Z left by s, set a = Z + b, and copy M into R
475	#define COMPUTE(a, b, s, M, R) \
476	{ \
477	.mii ; \
478	ld4 TRound = [TPtr], 4 ; \
479	dep.z Y = Z, 32, 32 ;; \
480	shrp Z = Z, Y, 64 - s ; \
481	} ;; \
482	{ \
483	.mmi ; \
484	add a = Z, b ; \
485	mov R = M ; \
486	nop 0x0 ; \
487	} ;;
488	// LOOP(a, b, s, M, R, label): same work as COMPUTE, but the second bundle ends with br.ctop back to label
489	#define LOOP(a, b, s, M, R, label) \
490	{ .mii ; \
491	ld4 TRound = [TPtr], 4 ; \
492	dep.z Y = Z, 32, 32 ;; \
493	shrp Z = Z, Y, 64 - s ; \
494	} ;; \
495	{ .mib ; \
496	add a = Z, b ; \
497	mov R = M ; \
498	br.ctop.sptk.many label ; \
499	} ;;
500
501	// G(B, C, D) = (B & D) | (C & ~D); the macro leaves Z = a + M + TRound + G(b, c, d), with X and Y as scratch
502
503	#define G(a, b, c, d, M) \
504	{ .mmi ; \
505	add Z = M, TRound ; \
506	and Y = b, d ; \
507	andcm X = c, d ; \
508	} ;; \
509	{ .mii ; \
510	add Z = Z, a ; \
511	or Y = Y, X ;; \
512	add Z = Z, Y ; \
513	} ;;
514
515	// H(B, C, D) = B ^ C ^ D; the macro leaves Z = a + M + TRound + H(b, c, d), with Y as scratch
516
517	#define H(a, b, c, d, M) \
518	{ .mmi ; \
519	add Z = M, TRound ; \
520	xor Y = b, c ; \
521	nop 0x0 ; \
522	} ;; \
523	{ .mii ; \
524	add Z = Z, a ; \
525	xor Y = Y, d ;; \
526	add Z = Z, Y ; \
527	} ;;
528
529	// I(B, C, D) = C ^ (B | ~D)
530	//
531	// However, since we have an andcm operator, we use the fact that
532	//
533	// Y ^ Z == ~Y ^ ~Z
534	//
535	// to rewrite the expression as
536	//
537	// I(B, C, D) = ~C ^ (~B & D)
538	// The macro leaves Z = a + M + TRound + I(b, c, d); X holds ~c (andcm with -1) and Y holds d & ~b before the xor.
539	#define I(a, b, c, d, M) \
540	{ .mmi ; \
541	add Z = M, TRound ; \
542	andcm Y = d, b ; \
543	andcm X = -1, c ; \
544	} ;; \
545	{ .mii ; \
546	add Z = Z, a ; \
547	xor Y = Y, X ;; \
548	add Z = Z, Y ; \
549	} ;;
550	// GG4(label): four G steps (rotations 5, 9, 14, 20) on M0..M3, cycling A, D, C, B as destination; the last step branches back to label via LOOP
551	#define GG4(label) \
552	G(A, B, C, D, M0) \
553	COMPUTE(A, B, 5, M0, RotateM0) \
554	G(D, A, B, C, M1) \
555	COMPUTE(D, A, 9, M1, RotateM1) \
556	G(C, D, A, B, M2) \
557	COMPUTE(C, D, 14, M2, RotateM2) \
558	G(B, C, D, A, M3) \
559	LOOP(B, C, 20, M3, RotateM3, label)
560	// HH4(label): four H steps (rotations 4, 11, 16, 23) on M0..M3, cycling A, D, C, B as destination; the last step branches back to label via LOOP
561	#define HH4(label) \
562	H(A, B, C, D, M0) \
563	COMPUTE(A, B, 4, M0, RotateM0) \
564	H(D, A, B, C, M1) \
565	COMPUTE(D, A, 11, M1, RotateM1) \
566	H(C, D, A, B, M2) \
567	COMPUTE(C, D, 16, M2, RotateM2) \
568	H(B, C, D, A, M3) \
569	LOOP(B, C, 23, M3, RotateM3, label)
570	// II4(label): four I steps (rotations 6, 10, 15, 21) on M0..M3, cycling A, D, C, B as destination; the last step branches back to label via LOOP
571	#define II4(label) \
572	I(A, B, C, D, M0) \
573	COMPUTE(A, B, 6, M0, RotateM0) \
574	I(D, A, B, C, M1) \
575	COMPUTE(D, A, 10, M1, RotateM1) \
576	I(C, D, A, B, M2) \
577	COMPUTE(C, D, 15, M2, RotateM2) \
578	I(B, C, D, A, M3) \
579	LOOP(B, C, 21, M3, RotateM3, label)
580	// FFLOAD(a, b, c, d, M, N, s): one F step, a = b + rotl32(a + M + TRound + ((c & b) | (d & ~b)), s); also loads the next input word into N while pMore is set and fetches the next round constant
581	#define FFLOAD(a, b, c, d, M, N, s) \
582	{ .mii ; \
583	(pMore) ld4 N = [DPtr], 4 ; \
584	add Z = M, TRound ; \
585	and Y = c, b ; \
586	} \
587	{ .mmi ; \
588	andcm X = d, b ;; \
589	add Z = Z, a ; \
590	or Y = Y, X ; \
591	} ;; \
592	{ .mii ; \
593	ld4 TRound = [TPtr], 4 ; \
594	add Z = Z, Y ;; \
595	dep.z Y = Z, 32, 32 ; \
596	} ;; \
597	{ .mii ; \
598	nop 0x0 ; \
599	shrp Z = Z, Y, 64 - s ;; \
600	add a = Z, b ; \
601	} ;;
602	// FFLOOP(a, b, c, d, M, N, s, dest): same F step as FFLOAD, then sets pMore = (LTrip != 0), decrements LTrip and closes the loop with br.ctop to dest
603	#define FFLOOP(a, b, c, d, M, N, s, dest) \
604	{ .mii ; \
605	(pMore) ld4 N = [DPtr], 4 ; \
606	add Z = M, TRound ; \
607	and Y = c, b ; \
608	} \
609	{ .mmi ; \
610	andcm X = d, b ;; \
611	add Z = Z, a ; \
612	or Y = Y, X ; \
613	} ;; \
614	{ .mii ; \
615	ld4 TRound = [TPtr], 4 ; \
616	add Z = Z, Y ;; \
617	dep.z Y = Z, 32, 32 ; \
618	} ;; \
619	{ .mii ; \
620	nop 0x0 ; \
621	shrp Z = Z, Y, 64 - s ;; \
622	add a = Z, b ; \
623	} \
624	{ .mib ; \
625	cmp.ne pMore, p0 = 0, LTrip ; \
626	add LTrip = -1, LTrip ; \
627	br.ctop.dptk.many dest ; \
628	} ;;
629	// md5_digest_block0: F round for aligned input. Entered by br.call with the return link in QUICK_RTN; falls through into md5_digest_GHI.
630	.type md5_digest_block0, @function
631	.align 32
632
633	.proc md5_digest_block0
634	.prologue
635	md5_digest_block0:
636	.altrp QUICK_RTN
637	.body
638	{ .mmi
639	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
640	mov LTrip = 2 // pMore stays set for the first three loop trips
641	mov ar.lc = 3 // four trips through the FF loop
642	} ;;
643	{ .mii
644	cmp.eq pMore, p0 = r0, r0 // pMore starts out true
645	mov ar.ec = 0
646	nop 0x0
647	} ;;
648
649	.md5_FF_round0:
650	FFLOAD(A, B, C, D, M12, RotateM0, 7)
651	FFLOAD(D, A, B, C, M13, RotateM1, 12)
652	FFLOAD(C, D, A, B, M14, RotateM2, 17)
653	FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
654	//
655	// !!! Fall through to md5_digest_GHI
656	//
657	.endp md5_digest_block0
658
659	.type md5_digest_GHI, @function
660	.align 32
661
662	.proc md5_digest_GHI
663	.prologue
664	.regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
665	md5_digest_GHI:
666	.altrp QUICK_RTN
667	.body
668	//
669	// The following sequence shuffles the block constants round for the
670	// next round:
671	//
672	// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
673	// 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
674	//
675	{ .mmi
676	mov Z = M0
677	mov Y = M15
678	mov ar.lc = 3 // four trips through the GG loop
679	}
680	{ .mmi
681	mov X = M2
682	mov W = M9
683	mov V = M4
684	} ;;
685
686	{ .mmi
687	mov M0 = M1
688	mov M15 = M12
689	mov ar.ec = 1
690	}
691	{ .mmi
692	mov M2 = M11
693	mov M9 = M14
694	mov M4 = M5
695	} ;;
696
697	{ .mmi
698	mov M1 = M6
699	mov M12 = M13
700	mov U = M3
701	}
702	{ .mmi
703	mov M11 = M8
704	mov M14 = M7
705	mov M5 = M10
706	} ;;
707
708	{ .mmi
709	mov M6 = Y
710	mov M13 = X
711	mov M3 = Z
712	}
713	{ .mmi
714	mov M8 = W
715	mov M7 = V
716	mov M10 = U
717	} ;;
718
719	.md5_GG_round:
720	GG4(.md5_GG_round)
721
722	// The following sequence shuffles the block constants round for the
723	// next round:
724	//
725	// 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
726	// 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
727
728	{ .mmi
729	mov Z = M0
730	mov Y = M1
731	mov ar.lc = 3 // four trips through the HH loop
732	}
733	{ .mmi
734	mov X = M3
735	mov W = M5
736	mov V = M6
737	} ;;
738
739	{ .mmi
740	mov M0 = M4
741	mov M1 = M11
742	mov ar.ec = 1
743	}
744	{ .mmi
745	mov M3 = M9
746	mov U = M8
747	mov T = M13
748	} ;;
749
750	{ .mmi
751	mov M4 = Z
752	mov M11 = Y
753	mov M5 = M7
754	}
755	{ .mmi
756	mov M6 = M14
757	mov M8 = M12
758	mov M13 = M15
759	} ;;
760
761	{ .mmi
762	mov M7 = W
763	mov M14 = V
764	nop 0x0
765	}
766	{ .mmi
767	mov M9 = X
768	mov M12 = U
769	mov M15 = T
770	} ;;
771
772	.md5_HH_round:
773	HH4(.md5_HH_round)
774
775	// The following sequence shuffles the block constants round for the
776	// next round:
777	//
778	// 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
779	// 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
780
781	{ .mmi
782	mov Z = M0
783	mov Y = M15
784	mov ar.lc = 3 // four trips through the II loop
785	}
786	{ .mmi
787	mov X = M10
788	mov W = M1
789	mov V = M4
790	} ;;
791
792	{ .mmi
793	mov M0 = M9
794	mov M15 = M12
795	mov ar.ec = 1
796	}
797	{ .mmi
798	mov M10 = M11
799	mov M1 = M6
800	mov M4 = M13
801	} ;;
802
803	{ .mmi
804	mov M9 = M14
805	mov M12 = M5
806	mov U = M3
807	}
808	{ .mmi
809	mov M11 = M8
810	mov M6 = M7
811	mov M13 = M2
812	} ;;
813
814	{ .mmi
815	mov M14 = Y
816	mov M5 = X
817	mov M3 = Z
818	}
819	{ .mmi
820	mov M8 = W
821	mov M7 = V
822	mov M2 = U
823	} ;;
824
825	.md5_II_round:
826	II4(.md5_II_round)
827
828	{ .mib
829	nop 0x0
830	nop 0x0
831	br.ret.sptk.many QUICK_RTN // return to the driver through the link in QUICK_RTN
832	} ;;
833
834	.endp md5_digest_GHI
835	// FFLOADU(a, b, c, d, M, P, N, s, offset): F step as in FFLOAD, plus realignment of word P: P = GETLW(P) | DTmp, and DTmp = GETRW(old P) is carried to the next word
836	#define FFLOADU(a, b, c, d, M, P, N, s, offset) \
837	{ .mii ; \
838	(pMore) ld4 N = [DPtr], 4 ; \
839	add Z = M, TRound ; \
840	and Y = c, b ; \
841	} \
842	{ .mmi ; \
843	andcm X = d, b ;; \
844	add Z = Z, a ; \
845	or Y = Y, X ; \
846	} ;; \
847	{ .mii ; \
848	ld4 TRound = [TPtr], 4 ; \
849	GETLW(W, P, offset) ; \
850	add Z = Z, Y ; \
851	} ;; \
852	{ .mii ; \
853	or W = W, DTmp ; \
854	dep.z Y = Z, 32, 32 ;; \
855	shrp Z = Z, Y, 64 - s ; \
856	} ;; \
857	{ .mii ; \
858	add a = Z, b ; \
859	GETRW(DTmp, P, offset) ; \
860	mov P = W ; \
861	} ;;
862	// FFLOOPU(a, b, c, d, M, P, N, s, offset): as FFLOADU but the realignment of P runs only under pMore; then pMore = (LTrip != 0), LTrip is decremented and br.ctop loops to .md5_FF_round<offset>
863	#define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
864	{ .mii ; \
865	(pMore) ld4 N = [DPtr], 4 ; \
866	add Z = M, TRound ; \
867	and Y = c, b ; \
868	} \
869	{ .mmi ; \
870	andcm X = d, b ;; \
871	add Z = Z, a ; \
872	or Y = Y, X ; \
873	} ;; \
874	{ .mii ; \
875	ld4 TRound = [TPtr], 4 ; \
876	(pMore) GETLW(W, P, offset) ; \
877	add Z = Z, Y ; \
878	} ;; \
879	{ .mii ; \
880	(pMore) or W = W, DTmp ; \
881	dep.z Y = Z, 32, 32 ;; \
882	shrp Z = Z, Y, 64 - s ; \
883	} ;; \
884	{ .mii ; \
885	add a = Z, b ; \
886	(pMore) GETRW(DTmp, P, offset) ; \
887	(pMore) mov P = W ; \
888	} \
889	{ .mib ; \
890	cmp.ne pMore, p0 = 0, LTrip ; \
891	add LTrip = -1, LTrip ; \
892	br.ctop.sptk.many .md5_FF_round##offset ; \
893	} ;;
894	// MD5FBLOCK(offset): emits md5_digest_block<offset>, the F round for input misaligned by offset bytes; same setup as md5_digest_block0, then branches to md5_digest_GHI
895	#define MD5FBLOCK(offset) \
896	.type md5_digest_block##offset, @function ; \
897	\
898	.align 32 ; \
899	.proc md5_digest_block##offset ; \
900	.prologue ; \
901	.altrp QUICK_RTN ; \
902	.body ; \
903	md5_digest_block##offset: \
904	{ .mmi ; \
905	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
906	mov LTrip = 2 ; \
907	mov ar.lc = 3 ; \
908	} ;; \
909	{ .mii ; \
910	cmp.eq pMore, p0 = r0, r0 ; \
911	mov ar.ec = 0 ; \
912	nop 0x0 ; \
913	} ;; \
914	\
915	.pred.rel "mutex", pLoad, pSkip ; \
916	.md5_FF_round##offset: \
917	FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
918	FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
919	FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
920	FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
921	\
922	{ .mib ; \
923	nop 0x0 ; \
924	nop 0x0 ; \
925	br.cond.sptk.many md5_digest_GHI ; \
926	} ;; \
927	.endp md5_digest_block##offset
928	// One F-round helper per non-zero misalignment; the driver calls md5_digest_block<offset> from MD5UNALIGNED(offset).
929	MD5FBLOCK(1)
930	MD5FBLOCK(2)
931	MD5FBLOCK(3)
932
933	.align 64
934	.type md5_constants, @object
935	md5_constants:
936	.md5_tbl_data_order: // To ensure little-endian data order whatever the host byte order,
937	// each of the 64 32-bit round constants is coded as four bytes, least-significant byte first (entry 0 is 0xd76aa478).
938	data1 0x78, 0xa4, 0x6a, 0xd7 // 0
939	data1 0x56, 0xb7, 0xc7, 0xe8 // 1
940	data1 0xdb, 0x70, 0x20, 0x24 // 2
941	data1 0xee, 0xce, 0xbd, 0xc1 // 3
942	data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
943	data1 0x2a, 0xc6, 0x87, 0x47 // 5
944	data1 0x13, 0x46, 0x30, 0xa8 // 6
945	data1 0x01, 0x95, 0x46, 0xfd // 7
946	data1 0xd8, 0x98, 0x80, 0x69 // 8
947	data1 0xaf, 0xf7, 0x44, 0x8b // 9
948	data1 0xb1, 0x5b, 0xff, 0xff // 10
949	data1 0xbe, 0xd7, 0x5c, 0x89 // 11
950	data1 0x22, 0x11, 0x90, 0x6b // 12
951	data1 0x93, 0x71, 0x98, 0xfd // 13
952	data1 0x8e, 0x43, 0x79, 0xa6 // 14
953	data1 0x21, 0x08, 0xb4, 0x49 // 15
954	data1 0x62, 0x25, 0x1e, 0xf6 // 16
955	data1 0x40, 0xb3, 0x40, 0xc0 // 17
956	data1 0x51, 0x5a, 0x5e, 0x26 // 18
957	data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
958	data1 0x5d, 0x10, 0x2f, 0xd6 // 20
959	data1 0x53, 0x14, 0x44, 0x02 // 21
960	data1 0x81, 0xe6, 0xa1, 0xd8 // 22
961	data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
962	data1 0xe6, 0xcd, 0xe1, 0x21 // 24
963	data1 0xd6, 0x07, 0x37, 0xc3 // 25
964	data1 0x87, 0x0d, 0xd5, 0xf4 // 26
965	data1 0xed, 0x14, 0x5a, 0x45 // 27
966	data1 0x05, 0xe9, 0xe3, 0xa9 // 28
967	data1 0xf8, 0xa3, 0xef, 0xfc // 29
968	data1 0xd9, 0x02, 0x6f, 0x67 // 30
969	data1 0x8a, 0x4c, 0x2a, 0x8d // 31
970	data1 0x42, 0x39, 0xfa, 0xff // 32
971	data1 0x81, 0xf6, 0x71, 0x87 // 33
972	data1 0x22, 0x61, 0x9d, 0x6d // 34
973	data1 0x0c, 0x38, 0xe5, 0xfd // 35
974	data1 0x44, 0xea, 0xbe, 0xa4 // 36
975	data1 0xa9, 0xcf, 0xde, 0x4b // 37
976	data1 0x60, 0x4b, 0xbb, 0xf6 // 38
977	data1 0x70, 0xbc, 0xbf, 0xbe // 39
978	data1 0xc6, 0x7e, 0x9b, 0x28 // 40
979	data1 0xfa, 0x27, 0xa1, 0xea // 41
980	data1 0x85, 0x30, 0xef, 0xd4 // 42
981	data1 0x05, 0x1d, 0x88, 0x04 // 43
982	data1 0x39, 0xd0, 0xd4, 0xd9 // 44
983	data1 0xe5, 0x99, 0xdb, 0xe6 // 45
984	data1 0xf8, 0x7c, 0xa2, 0x1f // 46
985	data1 0x65, 0x56, 0xac, 0xc4 // 47
986	data1 0x44, 0x22, 0x29, 0xf4 // 48
987	data1 0x97, 0xff, 0x2a, 0x43 // 49
988	data1 0xa7, 0x23, 0x94, 0xab // 50
989	data1 0x39, 0xa0, 0x93, 0xfc // 51
990	data1 0xc3, 0x59, 0x5b, 0x65 // 52
991	data1 0x92, 0xcc, 0x0c, 0x8f // 53
992	data1 0x7d, 0xf4, 0xef, 0xff // 54
993	data1 0xd1, 0x5d, 0x84, 0x85 // 55
994	data1 0x4f, 0x7e, 0xa8, 0x6f // 56
995	data1 0xe0, 0xe6, 0x2c, 0xfe // 57
996	data1 0x14, 0x43, 0x01, 0xa3 // 58
997	data1 0xa1, 0x11, 0x08, 0x4e // 59
998	data1 0x82, 0x7e, 0x53, 0xf7 // 60
999	data1 0x35, 0xf2, 0x3a, 0xbd // 61
1000	data1 0xbb, 0xd2, 0xd7, 0x2a // 62
1001	data1 0x91, 0xd3, 0x86, 0xeb // 63
1002	.size md5_constants#,64*4

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format