U
    Ö9%e&H  ã                   @   sò  d Z ddlZddlZddlmZ ddlmZ ddlZddlZddl	m
Z ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ e ¡  e  e!¡Z"ddddgZ#ej$j%ej$j&ej$j'ej$j(ej$j)ej$j*ej$j+ej$j,ej$j-ej$j.ej$j/ej$j0edœZ1e2dƒZ3e3dg Z4e4dddddg Z5dd„ Z6dd„ Z7e8e8e9e8e8dœdd„Z:e!dkrîe ;¡ Z<e<j=d e8d!d"d# e<j=d$d%d&d' e<j=d(de8d!d)d* e<j=d+e8d,d- e<j=d.e8d/d- e< >¡ Z?e:e?j@e?jAe?jBe?jCe?jDƒ dS )0zConvert ESM checkpoint.é    N)ÚPath)ÚTemporaryDirectory)Úbatch_encode_sequences)Ú
esmfold_v1)Ú	EsmConfigÚEsmFoldConfig)ÚEsmForMaskedLMÚEsmForSequenceClassificationÚEsmIntermediateÚEsmLayerÚ	EsmOutputÚEsmSelfAttentionÚEsmSelfOutput)ÚEsmForProteinFolding)ÚEsmTokenizer)Úlogging)Zprotein1A\  MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA)Zprotein2Z?MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA)Zprotein3zPMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLAGG)Zprotein4zNMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLA)Úesm1b_t33_650M_UR50SÚesm1v_t33_650M_UR90S_1Úesm1v_t33_650M_UR90S_2Úesm1v_t33_650M_UR90S_3Úesm1v_t33_650M_UR90S_4Úesm1v_t33_650M_UR90S_5Úesm2_t48_15B_UR50DÚesm2_t36_3B_UR50DÚesm2_t33_650M_UR50DÚesm2_t30_150M_UR50DÚesm2_t12_35M_UR50DÚesm2_t6_8M_UR50Dr   ZARNDCQEGHILKMFPSTWYVÚXz<pad>z<mask>z<cls>z<sep>z<eos>c               	   C   sJ   t ƒ 4} d t¡}t| ƒd }| |¡ tt|ƒd}W 5 Q R X d|_|S )NÚ
ú	vocab.txt©Ú
vocab_filer   )r   ÚjoinÚrestypes_with_extrasr   Ú
write_textr   ÚstrÚpad_token_id)ÚtempdirÚvocabr"   Úhf_tokenizer© r+   úb/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/esm/convert_esm.pyÚget_esmfold_tokenizerN   s    

r-   c                 C   s>   |  |  ¡ ¡}|jr$td|j› ƒ‚|jr:td|j› ƒ‚d S )NzMissing keys: zUnexpected keys: )Zload_state_dictZ
state_dictZmissing_keysÚ
ValueErrorZunexpected_keys)Zoriginal_moduleZ
our_moduleÚstatusr+   r+   r,   Útransfer_and_check_weightsX   s
    r0   )ÚmodelÚpytorch_dump_folder_pathÚclassification_headÚpush_to_repoÚ
auth_tokenc           0      C   s\	  |   d¡rt|  ƒ }nt|  ƒ \}}| ¡  |   d¡r|jj}|jj}|jj}	d| }
|jj}d}d}d}tƒ }|j	 
¡ D ]&\}}t||ƒr||dkr|t|||ƒ q||j	j 
¡ D ]*\}}t|j|ƒr°|dkr°t|j||ƒ q°|j	jj 
¡ D ]&\}}t|jj|ƒrêt|jj||ƒ qên‚t|dƒrf|jj}|jj}|jj}	|jj}
|jj}|jrTdnd}d	}d}d
}n0|j}|j}|j}	d| }
|j}d}d}d}d
}|r¤|jj}t|jƒ}|j}|j}|rÈ|j}n|}t|jj||||	|
dddd|||||||d}|r|jd jjjd |_t d|ƒ |   d¡r2t!}n|r>t"}nt#}||ƒ} |  ¡  |jj| jj$j%_|d	kr||j&j| jj$j'_|jr¤|jj| jj$j(_|jj)| jj$j(_)|j*j| jj+j*_|j*j)| jj+j*_)t,|j-ƒD ]ä}| jj+j.| }|j| }|j/j0}|j1j2jj3j|j1j4jj3j  krD|j1j5jj3j  krDt6 7|j8|j8f¡ksJn t9‚|j1j4j|j:j_3|j1j4j)|j:j)_3|j1j2j|j;j_3|j1j2j)|j;j)_3|j1j5j|j<j_3|j1j5j)|j<j)_3t=|j1dd
ƒd
k	rÎ|j1j>j?|j@j?_3|jAj|j/jB_|jAj)|j/jB_)|jCj|jB_|jCj)|jB_)|j/jD}|jEjj|j1jjjks&t9‚|j1jj|jE_|j1jj)|jE_)|jF}|jEjj|jGjjksbt9‚|jGj|jE_|jGj)|jE_)|jD}|jEjj|jHjjksšt9‚|jHj|jE_|jHj)|jE_)qÎ|r8|jIj3| jI_3|jJj3| jJ_3tK|jL| jLƒ tK|jM| jMƒ tK|j| jƒ tK|jN| jNƒ tK|jO| jOƒ tK|jP| jPƒ tK|jQ| jQƒ n´|r’|jjd jEj| jRjE_|jd jEj)| jRjE_)|jd jj| jRj_|jd jj)| jRj_)nZ|jPjEj| jPjE_|jPjEj)| jPjE_)|jPj(j| jPj(_|jPj(j)| jPj(_)|jPj| jPjS_|jPj)| jP_)tK|jT| jjTƒ |rtUd
d… }ntU}|r~tVƒ } | dd„ |D ƒdddd}!tWdd„ |D ƒƒ\}"}#}$}$}$t6 X|!d |"k¡ozt6 X|!d |#k¡}%n‚| Y¡ }&|&|ƒ\}'}(})tZƒ 6}*d [|j¡}+t\|*ƒd },|, ]|+¡ t^t_|,ƒd} W 5 Q R X | dd„ |D ƒddd}!t6 X|!d |)k¡}%t d |%rd!nd"ƒ |%s"t`d#ƒ‚t6 a¡ ì |rn| b¡  cd$d„ |D ƒ¡}-|  b¡ |!d  b¡ |!d  b¡ d%}.nX| f |!d&di—Ž}.|.d' }.|r¦|jdjd | e|)¡ƒ}-n ||!d tft,d(ƒƒd)}-|-d' }-|rt6 gt6 h|.d* |-d*  ¡¡ i¡ }/t6jj|.d* |-d* dd+}%n(t6 gt6 h|.|- ¡¡ i¡ }/t6jj|.|-dd+}%t d,|/› ƒ t d-|%rJd!nd"ƒ |%s^t`d.ƒ‚|sæ|  k|!d |!d ¡}.| k|!d ¡}-t6 gt6 h|.|- ¡¡ i¡ }/t6jj|.|-dd+}%t d/ƒ t d,|/› ƒ t d-|%rÒd!nd"ƒ |%sæt`d.ƒ‚tl \|¡jmddd0 t d1|› ƒ |  n|¡ ~W 5 Q R X t d2|› ƒ |  n|¡ |	rX| jo||d3 | jo||d3 d
S )4z?
    Copy/paste/tweak esm's weights to our BERT structure.
    Zesmfoldé   FZrotaryTÚtrunkÚstructure_moduleÚargsÚabsoluteNi  gñhãˆµøä>g        )Z
vocab_sizeÚmask_token_idÚhidden_sizeÚnum_hidden_layersÚnum_attention_headsÚintermediate_sizeZmax_position_embeddingsZlayer_norm_epsZattention_probs_dropout_probZhidden_dropout_probr'   Úemb_layer_norm_beforeÚtoken_dropoutÚposition_embedding_typeÚis_folding_modelÚesmfold_configÚ
vocab_listZmnlir   zOur ESM config:Úrot_embé   c                 S   s   g | ]}|d  ‘qS ©é   r+   ©Ú.0Úrowr+   r+   r,   Ú
<listcomp>,  s     z5convert_esm_checkpoint_to_pytorch.<locals>.<listcomp>Úpt)Úreturn_tensorsÚpaddingZadd_special_tokensc                 S   s   g | ]}|d  ‘qS rH   r+   rJ   r+   r+   r,   rM   .  s     Ú	input_idsÚattention_maskr   r    r!   c                 S   s   g | ]}|d  ‘qS rH   r+   rJ   r+   r+   r,   rM   =  s     )rO   rP   z1Do both models tokenizers output the same tokens?u   ðŸ”¥u   ðŸ’©zTokenization does not match!c                 S   s   g | ]}|d  ‘qS rH   r+   rJ   r+   r+   r,   rM   K  s     )rQ   rR   Zoutput_hidden_statesZlogitsiç  )Zrepr_layersZ	positions)Zatolzmax_absolute_diff = z'Do both models output the same tensors?zSomething went wRoNgzContact prediction testing:)ÚparentsÚexist_okzSaving model to zSaving tokenizer to )Zrepo_idZtoken_token)pÚ
startswithÚMODEL_MAPPINGÚevalÚesmÚ	embed_dimÚ
num_layersZattention_headsrA   r   ÚcfgÚitemsÚhasattrÚsetattrr7   r8   r9   ZlayersZffn_embed_dimr@   ÚalphabetÚtupleZall_toksZmask_idxZpadding_idxr   Zembed_tokensZnum_embeddingsZclassification_headsZout_projÚweightÚshapeZ
num_labelsÚprintr   r	   r   Z
embeddingsZword_embeddingsZembed_positionsZposition_embeddingsZ
layer_normZbiasZemb_layer_norm_afterÚencoderÚranger=   ÚlayerZ	attentionÚselfÚ	self_attnZk_projÚdataZq_projZv_projÚtorchÚSizer<   ÚAssertionErrorÚqueryÚkeyÚvalueÚgetattrrF   Zinv_freqZrotary_embeddingsZself_attn_layer_normZ	LayerNormZfinal_layer_normÚoutputZdenseÚintermediateZfc1Zfc2Zesm_s_combineZ
af2_to_esmr0   Z	embeddingZ	esm_s_mlpZdistogram_headZptm_headZlm_headZ	lddt_headÚ
classifierÚdecoderZcontact_headÚSAMPLE_DATAr-   Úesmfold_encode_sequencesÚallZget_batch_converterr   r#   r   r%   r   r&   Ú	ExceptionZno_gradÚcudaZinferr1   Zextract_featuresÚlistÚmaxÚabsÚitemZallcloseZpredict_contactsÚpathlibÚmkdirZsave_pretrainedZpush_to_hub)0r1   r2   r3   r4   r5   rX   r_   rY   rZ   r>   r?   rA   r@   rB   rC   rD   rn   ÚvalrE   r;   r'   Zoriginal_esm_modelÚconfigZmodel_classÚirf   Z	esm_layerrh   Zself_outputrr   Zbert_outputZsample_datar*   Z	hf_tokensZesmfold_aasZesmfold_maskÚ_ÚsuccessZbatch_converterZbatch_labelsZ
batch_strsZbatch_tokensr(   r)   r"   Ztheir_outputZ
our_outputZmax_absolute_diffr+   r+   r,   Ú!convert_esm_checkpoint_to_pytorch`   s¨   

ï


þ
ý
ü   ÿ
ÿ

 
ÿ 

r…   Ú__main__z--pytorch_dump_folder_pathTz!Path to the output PyTorch model.)ÚtypeÚrequiredÚhelpz--classification_headÚ
store_truez/Whether to convert a final classification head.)Úactionr‰   z--modelzName of model to convert.)Údefaultr‡   rˆ   r‰   z--push_to_repoz(Repo to upload to (including username!).)r‡   r‰   z--auth_tokenzHuggingFace auth token.)EÚ__doc__Úargparser~   r   Útempfiler   rX   Z
esm_modulerj   Zesm.esmfold.v1.miscr   rv   Zesm.esmfold.v1.pretrainedr   Z)transformers.models.esm.configuration_esmr   r   Z$transformers.models.esm.modeling_esmr   r	   r
   r   r   r   r   Z(transformers.models.esm.modeling_esmfoldr   Z(transformers.models.esm.tokenization_esmr   Ztransformers.utilsr   Zset_verbosity_infoZ
get_loggerÚ__name__Úloggerru   Z
pretrainedr   r   r   r   r   r   r   r   r   r   r   r   rV   rz   ZrestypesZrestypes_with_xr$   r-   r0   r&   Úboolr…   ÚArgumentParserÚparserÚadd_argumentÚ
parse_argsr9   r1   r2   r3   r4   r5   r+   r+   r+   r,   Ú<module>   sŽ   $	
ùó

	    ÿ  #
   ÿ  ÿ    ÿ