pashu123 · April 6, 2022 12:00
diff --git a/bert_back_torch.mlir b/bert_back_torch.mlir
 module attributes {torch.debug_module_name = "GraphModule"} {
  func @forward(%arg0: !torch.vtensor<[768],f32>, %arg1: !torch.vtensor<[768],f32>, %arg2: !torch.vtensor<[768],f32>, %arg3: !torch.vtensor<[768],f32>, %arg4: !torch.vtensor<[768],f32>, %arg5: !torch.vtensor<[768],f32>, %arg6: !torch.vtensor<[768],f32>, %arg7: !torch.vtensor<[768],f32>, %arg8: !torch.vtensor<[768],f32>, %arg9: !torch.vtensor<[768],f32>, %arg10: !torch.vtensor<[768],f32>, %arg11: !torch.vtensor<[768],f32>, %arg12: !torch.vtensor<[768],f32>, %arg13: !torch.vtensor<[768],f32>, %arg14: !torch.vtensor<[768],f32>, %arg15: !torch.vtensor<[768],f32>, %arg16: !torch.vtensor<[768],f32>, %arg17: !torch.vtensor<[768],f32>, %arg18: !torch.vtensor<[768],f32>, %arg19: !torch.vtensor<[768],f32>, %arg20: !torch.vtensor<[768],f32>, %arg21: !torch.vtensor<[768],f32>, %arg22: !torch.vtensor<[768],f32>, %arg23: !torch.vtensor<[768],f32>, %arg24: !torch.vtensor<[768],f32>, %arg25: !torch.vtensor<[768],f32>, %arg26: !torch.vtensor<[768],f32>, %arg27: !torch.vtensor<[768],f32>, %arg28: !torch.vtensor<[768],f32>, %arg29: !torch.vtensor<[768],f32>, %arg30: !torch.vtensor<[768],f32>, %arg31: !torch.vtensor<[768],f32>, %arg32: !torch.vtensor<[768],f32>, %arg33: !torch.vtensor<[768],f32>, %arg34: !torch.vtensor<[768],f32>, %arg35: !torch.vtensor<[768],f32>, %arg36: !torch.vtensor<[768],f32>, %arg37: !torch.vtensor<[768],f32>, %arg38: !torch.vtensor<[768],f32>, %arg39: !torch.vtensor<[768],f32>, %arg40: !torch.vtensor<[768],f32>, %arg41: !torch.vtensor<[768],f32>, %arg42: !torch.vtensor<[768],f32>, %arg43: !torch.vtensor<[768],f32>, %arg44: !torch.vtensor<[768],f32>, %arg45: !torch.vtensor<[768],f32>, %arg46: !torch.vtensor<[768],f32>, %arg47: !torch.vtensor<[768],f32>, %arg48: !torch.vtensor<[768],f32>, %arg49: !torch.vtensor<[768],f32>, %arg50: !torch.vtensor<[768],f32>, %arg51: !torch.vtensor<[768],f32>, %arg52: !torch.vtensor<[4,512],si64>, %arg53: !torch.vtensor<[4,512],si64>, %arg54: !torch.vtensor<[1,512],si64>, %arg55: !torch.vtensor<[4,512,768],f32>, %arg56: 
!torch.vtensor<[4,512,768],f32>, %arg57: !torch.vtensor<[768,768],f32>, %arg58: !torch.vtensor<[2048,768],f32>, %arg59: !torch.vtensor<[768,768],f32>, %arg60: !torch.vtensor<[2048,768],f32>, %arg61: !torch.vtensor<[768,768],f32>, %arg62: !torch.vtensor<[2048,768],f32>, %arg63: !torch.vtensor<[48,512,64],f32>, %arg64: !torch.vtensor<[48,64,512],f32>, %arg65: !torch.vtensor<[4,12,512,512],f32>, %arg66: !torch.vtensor<[4,12,512,512],f32>, %arg67: !torch.vtensor<[48,512,512],f32>, %arg68: !torch.vtensor<[48,512,64],f32>, %arg69: !torch.vtensor<[768,768],f32>, %arg70: !torch.vtensor<[2048,768],f32>, %arg71: !torch.vtensor<[4,512,768],f32>, %arg72: !torch.vtensor<[4,512,768],f32>, %arg73: !torch.vtensor<[768,3072],f32>, %arg74: !torch.vtensor<[2048,768],f32>, %arg75: !torch.vtensor<[4,512,3072],f32>, %arg76: !torch.vtensor<[3072,768],f32>, %arg77: !torch.vtensor<[2048,3072],f32>, %arg78: !torch.vtensor<[4,512,768],f32>, %arg79: !torch.vtensor<[4,512,768],f32>, %arg80: !torch.vtensor<[768,768],f32>, %arg81: !torch.vtensor<[2048,768],f32>, %arg82: !torch.vtensor<[768,768],f32>, %arg83: !torch.vtensor<[2048,768],f32>, %arg84: !torch.vtensor<[768,768],f32>, %arg85: !torch.vtensor<[2048,768],f32>, %arg86: !torch.vtensor<[48,512,64],f32>, %arg87: !torch.vtensor<[48,64,512],f32>, %arg88: !torch.vtensor<[4,12,512,512],f32>, %arg89: !torch.vtensor<[4,12,512,512],f32>, %arg90: !torch.vtensor<[48,512,512],f32>, %arg91: !torch.vtensor<[48,512,64],f32>, %arg92: !torch.vtensor<[768,768],f32>, %arg93: !torch.vtensor<[2048,768],f32>, %arg94: !torch.vtensor<[4,512,768],f32>, %arg95: !torch.vtensor<[4,512,768],f32>, %arg96: !torch.vtensor<[768,3072],f32>, %arg97: !torch.vtensor<[2048,768],f32>, %arg98: !torch.vtensor<[4,512,3072],f32>, %arg99: !torch.vtensor<[3072,768],f32>, %arg100: !torch.vtensor<[2048,3072],f32>, %arg101: !torch.vtensor<[4,512,768],f32>, %arg102: !torch.vtensor<[4,512,768],f32>, %arg103: !torch.vtensor<[768,768],f32>, %arg104: !torch.vtensor<[2048,768],f32>, %arg105: 
!torch.vtensor<[768,768],f32>, %arg106: !torch.vtensor<[2048,768],f32>, %arg107: !torch.vtensor<[768,768],f32>, %arg108: !torch.vtensor<[2048,768],f32>, %arg109: !torch.vtensor<[48,512,64],f32>, %arg110: !torch.vtensor<[48,64,512],f32>, %arg111: !torch.vtensor<[4,12,512,512],f32>, %arg112: !torch.vtensor<[4,12,512,512],f32>, %arg113: !torch.vtensor<[48,512,512],f32>, %arg114: !torch.vtensor<[48,512,64],f32>, %arg115: !torch.vtensor<[768,768],f32>, %arg116: !torch.vtensor<[2048,768],f32>, %arg117: !torch.vtensor<[4,512,768],f32>, %arg118: !torch.vtensor<[4,512,768],f32>, %arg119: !torch.vtensor<[768,3072],f32>, %arg120: !torch.vtensor<[2048,768],f32>, %arg121: !torch.vtensor<[4,512,3072],f32>, %arg122: !torch.vtensor<[3072,768],f32>, %arg123: !torch.vtensor<[2048,3072],f32>, %arg124: !torch.vtensor<[4,512,768],f32>, %arg125: !torch.vtensor<[4,512,768],f32>, %arg126: !torch.vtensor<[768,768],f32>, %arg127: !torch.vtensor<[2048,768],f32>, %arg128: !torch.vtensor<[768,768],f32>, %arg129: !torch.vtensor<[2048,768],f32>, %arg130: !torch.vtensor<[768,768],f32>, %arg131: !torch.vtensor<[2048,768],f32>, %arg132: !torch.vtensor<[48,512,64],f32>, %arg133: !torch.vtensor<[48,64,512],f32>, %arg134: !torch.vtensor<[4,12,512,512],f32>, %arg135: !torch.vtensor<[4,12,512,512],f32>, %arg136: !torch.vtensor<[48,512,512],f32>, %arg137: !torch.vtensor<[48,512,64],f32>, %arg138: !torch.vtensor<[768,768],f32>, %arg139: !torch.vtensor<[2048,768],f32>, %arg140: !torch.vtensor<[4,512,768],f32>, %arg141: !torch.vtensor<[4,512,768],f32>, %arg142: !torch.vtensor<[768,3072],f32>, %arg143: !torch.vtensor<[2048,768],f32>, %arg144: !torch.vtensor<[4,512,3072],f32>, %arg145: !torch.vtensor<[3072,768],f32>, %arg146: !torch.vtensor<[2048,3072],f32>, %arg147: !torch.vtensor<[4,512,768],f32>, %arg148: !torch.vtensor<[4,512,768],f32>, %arg149: !torch.vtensor<[768,768],f32>, %arg150: !torch.vtensor<[2048,768],f32>, %arg151: !torch.vtensor<[768,768],f32>, %arg152: !torch.vtensor<[2048,768],f32>, %arg153: 
!torch.vtensor<[768,768],f32>, %arg154: !torch.vtensor<[2048,768],f32>, %arg155: !torch.vtensor<[48,512,64],f32>, %arg156: !torch.vtensor<[48,64,512],f32>, %arg157: !torch.vtensor<[4,12,512,512],f32>, %arg158: !torch.vtensor<[4,12,512,512],f32>, %arg159: !torch.vtensor<[48,512,512],f32>, %arg160: !torch.vtensor<[48,512,64],f32>, %arg161: !torch.vtensor<[768,768],f32>, %arg162: !torch.vtensor<[2048,768],f32>, %arg163: !torch.vtensor<[4,512,768],f32>, %arg164: !torch.vtensor<[4,512,768],f32>, %arg165: !torch.vtensor<[768,3072],f32>, %arg166: !torch.vtensor<[2048,768],f32>, %arg167: !torch.vtensor<[4,512,3072],f32>, %arg168: !torch.vtensor<[3072,768],f32>, %arg169: !torch.vtensor<[2048,3072],f32>, %arg170: !torch.vtensor<[4,512,768],f32>, %arg171: !torch.vtensor<[4,512,768],f32>, %arg172: !torch.vtensor<[768,768],f32>, %arg173: !torch.vtensor<[2048,768],f32>, %arg174: !torch.vtensor<[768,768],f32>, %arg175: !torch.vtensor<[2048,768],f32>, %arg176: !torch.vtensor<[768,768],f32>, %arg177: !torch.vtensor<[2048,768],f32>, %arg178: !torch.vtensor<[48,512,64],f32>, %arg179: !torch.vtensor<[48,64,512],f32>, %arg180: !torch.vtensor<[4,12,512,512],f32>, %arg181: !torch.vtensor<[4,12,512,512],f32>, %arg182: !torch.vtensor<[48,512,512],f32>, %arg183: !torch.vtensor<[48,512,64],f32>, %arg184: !torch.vtensor<[768,768],f32>, %arg185: !torch.vtensor<[2048,768],f32>, %arg186: !torch.vtensor<[4,512,768],f32>, %arg187: !torch.vtensor<[4,512,768],f32>, %arg188: !torch.vtensor<[768,3072],f32>, %arg189: !torch.vtensor<[2048,768],f32>, %arg190: !torch.vtensor<[4,512,3072],f32>, %arg191: !torch.vtensor<[3072,768],f32>, %arg192: !torch.vtensor<[2048,3072],f32>, %arg193: !torch.vtensor<[4,512,768],f32>, %arg194: !torch.vtensor<[4,512,768],f32>, %arg195: !torch.vtensor<[768,768],f32>, %arg196: !torch.vtensor<[2048,768],f32>, %arg197: !torch.vtensor<[768,768],f32>, %arg198: !torch.vtensor<[2048,768],f32>, %arg199: !torch.vtensor<[768,768],f32>, %arg200: !torch.vtensor<[2048,768],f32>, %arg201: 
!torch.vtensor<[48,512,64],f32>, %arg202: !torch.vtensor<[48,64,512],f32>, %arg203: !torch.vtensor<[4,12,512,512],f32>, %arg204: !torch.vtensor<[4,12,512,512],f32>, %arg205: !torch.vtensor<[48,512,512],f32>, %arg206: !torch.vtensor<[48,512,64],f32>, %arg207: !torch.vtensor<[768,768],f32>, %arg208: !torch.vtensor<[2048,768],f32>, %arg209: !torch.vtensor<[4,512,768],f32>, %arg210: !torch.vtensor<[4,512,768],f32>, %arg211: !torch.vtensor<[768,3072],f32>, %arg212: !torch.vtensor<[2048,768],f32>, %arg213: !torch.vtensor<[4,512,3072],f32>, %arg214: !torch.vtensor<[3072,768],f32>, %arg215: !torch.vtensor<[2048,3072],f32>, %arg216: !torch.vtensor<[4,512,768],f32>, %arg217: !torch.vtensor<[4,512,768],f32>, %arg218: !torch.vtensor<[768,768],f32>, %arg219: !torch.vtensor<[2048,768],f32>, %arg220: !torch.vtensor<[768,768],f32>, %arg221: !torch.vtensor<[2048,768],f32>, %arg222: !torch.vtensor<[768,768],f32>, %arg223: !torch.vtensor<[2048,768],f32>, %arg224: !torch.vtensor<[48,512,64],f32>, %arg225: !torch.vtensor<[48,64,512],f32>, %arg226: !torch.vtensor<[4,12,512,512],f32>, %arg227: !torch.vtensor<[4,12,512,512],f32>, %arg228: !torch.vtensor<[48,512,512],f32>, %arg229: !torch.vtensor<[48,512,64],f32>, %arg230: !torch.vtensor<[768,768],f32>, %arg231: !torch.vtensor<[2048,768],f32>, %arg232: !torch.vtensor<[4,512,768],f32>, %arg233: !torch.vtensor<[4,512,768],f32>, %arg234: !torch.vtensor<[768,3072],f32>, %arg235: !torch.vtensor<[2048,768],f32>, %arg236: !torch.vtensor<[4,512,3072],f32>, %arg237: !torch.vtensor<[3072,768],f32>, %arg238: !torch.vtensor<[2048,3072],f32>, %arg239: !torch.vtensor<[4,512,768],f32>, %arg240: !torch.vtensor<[4,512,768],f32>, %arg241: !torch.vtensor<[768,768],f32>, %arg242: !torch.vtensor<[2048,768],f32>, %arg243: !torch.vtensor<[768,768],f32>, %arg244: !torch.vtensor<[2048,768],f32>, %arg245: !torch.vtensor<[768,768],f32>, %arg246: !torch.vtensor<[2048,768],f32>, %arg247: !torch.vtensor<[48,512,64],f32>, %arg248: !torch.vtensor<[48,64,512],f32>, 
%arg249: !torch.vtensor<[4,12,512,512],f32>, %arg250: !torch.vtensor<[4,12,512,512],f32>, %arg251: !torch.vtensor<[48,512,512],f32>, %arg252: !torch.vtensor<[48,512,64],f32>, %arg253: !torch.vtensor<[768,768],f32>, %arg254: !torch.vtensor<[2048,768],f32>, %arg255: !torch.vtensor<[4,512,768],f32>, %arg256: !torch.vtensor<[4,512,768],f32>, %arg257: !torch.vtensor<[768,3072],f32>, %arg258: !torch.vtensor<[2048,768],f32>, %arg259: !torch.vtensor<[4,512,3072],f32>, %arg260: !torch.vtensor<[3072,768],f32>, %arg261: !torch.vtensor<[2048,3072],f32>, %arg262: !torch.vtensor<[4,512,768],f32>, %arg263: !torch.vtensor<[4,512,768],f32>, %arg264: !torch.vtensor<[768,768],f32>, %arg265: !torch.vtensor<[2048,768],f32>, %arg266: !torch.vtensor<[768,768],f32>, %arg267: !torch.vtensor<[2048,768],f32>, %arg268: !torch.vtensor<[768,768],f32>, %arg269: !torch.vtensor<[2048,768],f32>, %arg270: !torch.vtensor<[48,512,64],f32>, %arg271: !torch.vtensor<[48,64,512],f32>, %arg272: !torch.vtensor<[4,12,512,512],f32>, %arg273: !torch.vtensor<[4,12,512,512],f32>, %arg274: !torch.vtensor<[48,512,512],f32>, %arg275: !torch.vtensor<[48,512,64],f32>, %arg276: !torch.vtensor<[768,768],f32>, %arg277: !torch.vtensor<[2048,768],f32>, %arg278: !torch.vtensor<[4,512,768],f32>, %arg279: !torch.vtensor<[4,512,768],f32>, %arg280: !torch.vtensor<[768,3072],f32>, %arg281: !torch.vtensor<[2048,768],f32>, %arg282: !torch.vtensor<[4,512,3072],f32>, %arg283: !torch.vtensor<[3072,768],f32>, %arg284: !torch.vtensor<[2048,3072],f32>, %arg285: !torch.vtensor<[4,512,768],f32>, %arg286: !torch.vtensor<[4,512,768],f32>, %arg287: !torch.vtensor<[768,768],f32>, %arg288: !torch.vtensor<[2048,768],f32>, %arg289: !torch.vtensor<[768,768],f32>, %arg290: !torch.vtensor<[2048,768],f32>, %arg291: !torch.vtensor<[768,768],f32>, %arg292: !torch.vtensor<[2048,768],f32>, %arg293: !torch.vtensor<[48,512,64],f32>, %arg294: !torch.vtensor<[48,64,512],f32>, %arg295: !torch.vtensor<[4,12,512,512],f32>, %arg296: 
!torch.vtensor<[4,12,512,512],f32>, %arg297: !torch.vtensor<[48,512,512],f32>, %arg298: !torch.vtensor<[48,512,64],f32>, %arg299: !torch.vtensor<[768,768],f32>, %arg300: !torch.vtensor<[2048,768],f32>, %arg301: !torch.vtensor<[4,512,768],f32>, %arg302: !torch.vtensor<[4,512,768],f32>, %arg303: !torch.vtensor<[768,3072],f32>, %arg304: !torch.vtensor<[2048,768],f32>, %arg305: !torch.vtensor<[4,512,3072],f32>, %arg306: !torch.vtensor<[3072,768],f32>, %arg307: !torch.vtensor<[2048,3072],f32>, %arg308: !torch.vtensor<[4,512,768],f32>, %arg309: !torch.vtensor<[4,512,768],f32>, %arg310: !torch.vtensor<[768,768],f32>, %arg311: !torch.vtensor<[2048,768],f32>, %arg312: !torch.vtensor<[768,768],f32>, %arg313: !torch.vtensor<[2048,768],f32>, %arg314: !torch.vtensor<[768,768],f32>, %arg315: !torch.vtensor<[2048,768],f32>, %arg316: !torch.vtensor<[48,512,64],f32>, %arg317: !torch.vtensor<[48,64,512],f32>, %arg318: !torch.vtensor<[4,12,512,512],f32>, %arg319: !torch.vtensor<[4,12,512,512],f32>, %arg320: !torch.vtensor<[48,512,512],f32>, %arg321: !torch.vtensor<[48,512,64],f32>, %arg322: !torch.vtensor<[768,768],f32>, %arg323: !torch.vtensor<[2048,768],f32>, %arg324: !torch.vtensor<[4,512,768],f32>, %arg325: !torch.vtensor<[4,512,768],f32>, %arg326: !torch.vtensor<[768,3072],f32>, %arg327: !torch.vtensor<[2048,768],f32>, %arg328: !torch.vtensor<[4,512,3072],f32>, %arg329: !torch.vtensor<[3072,768],f32>, %arg330: !torch.vtensor<[2048,3072],f32>, %arg331: !torch.vtensor<[4,512,768],f32>, %arg332: !torch.vtensor<[4,512,768],f32>, %arg333: !torch.vtensor<[768,768],f32>, %arg334: !torch.vtensor<[2048,768],f32>, %arg335: !torch.vtensor<[4,512,768],f32>, %arg336: !torch.vtensor<[4,512,768],f32>, %arg337: !torch.vtensor<[768,30522],f32>, %arg338: !torch.vtensor<[2048,768],f32>, %arg339: !torch.vtensor<[2048],si64>, %arg340: !torch.vtensor<[2048,30522],f32>, %arg341: !torch.vtensor<[],f32>, %arg342: !torch.vtensor<[4,512,30522],f32>) -> (!torch.vtensor<[768],f32>, 
!torch.vtensor<[768],f32>, !torch.vtensor<[512,768],f32>, !torch.vtensor<[2,768],f32>, !torch.vtensor<[30522,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, 
!torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, 
!torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[30522],f32>, !torch.vtensor<[30522,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],f32>) {
    // ---- Constants -------------------------------------------------------
    // Tensor literals: a rank-0 si64 holding 768, a rank-0 f64 holding 8.0,
    // and a one-element f32 zero tensor. None of the three is consumed in the
    // visible portion of this function.
    %0 = torch.vtensor.literal(dense<768> : tensor<si64>) : !torch.vtensor<[],si64>
    %1 = torch.vtensor.literal(dense<8.000000e+00> : tensor<f64>) : !torch.vtensor<[],f64>
    %2 = torch.vtensor.literal(dense<0.000000e+00> : tensor<1xf32>) : !torch.vtensor<[1],f32>
    // Scalar constants. The SSA name encodes the value (e.g. %int-100 is -100).
    %int-2 = torch.constant.int -2
    %int-1 = torch.constant.int -1
    %int3072 = torch.constant.int 3072
    %int2048 = torch.constant.int 2048
    %int30522 = torch.constant.int 30522
    %int512 = torch.constant.int 512
    // Passed as the last integer operand of the nll_loss_forward/backward ops
    // further down (presumably ignore_index - confirm against the ATen schema).
    %int-100 = torch.constant.int -100
    // Passed as the final operand of every native_layer_norm op below. The
    // printed SSA name is truncated; the actual value is ~1e-12.
    %float9.999990e-13 = torch.constant.float 9.9999999999999998E-13
    %int768 = torch.constant.int 768
    %none = torch.constant.none
    %true = torch.constant.bool true
    %false = torch.constant.bool false
    %str = torch.constant.str "none"
    %int0 = torch.constant.int 0
    %int1 = torch.constant.int 1
    %int2 = torch.constant.int 2
    %int3 = torch.constant.int 3
    %int4 = torch.constant.int 4
    %int6 = torch.constant.int 6
    %int12 = torch.constant.int 12
    %int48 = torch.constant.int 48
    %int64 = torch.constant.int 64
    %cpu = torch.constant.device "cpu"
    // ---- Layer norms -----------------------------------------------------
    // %3 is the one-element int list [768]; it is the second operand of all
    // 26 native_layer_norm ops below.
    %3 = torch.prim.ListConstruct %int768 : (!torch.int) -> !torch.list<int>
    // Each op takes a [4,512,768] f32 input, the list %3, two [768] f32
    // tensors and the float constant, and yields one [4,512,768] result plus
    // two [4,512,1] results. Per the ATen schema the operand order is
    // (input, normalized_shape, weight, bias, eps) and the results are
    // (output, mean, rstd) - TODO confirm against the torch-mlir version used.
    // None of the results is consumed in the visible portion of the function.
    // NOTE(review): the [768] argument pairs are not numbered monotonically
    // (%arg9/%arg8 is followed by %arg19/%arg18, and %arg11/%arg10 only shows
    // up four ops before the end). Looks like the parameter arguments were
    // ordered by name string rather than layer index - confirm with the
    // exporter before relying on argument positions.
    %result0, %result1, %result2 = torch.aten.native_layer_norm %arg55, %3, %arg1, %arg0, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_0, %result1_1, %result2_2 = torch.aten.native_layer_norm %arg72, %3, %arg3, %arg2, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_3, %result1_4, %result2_5 = torch.aten.native_layer_norm %arg79, %3, %arg5, %arg4, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_6, %result1_7, %result2_8 = torch.aten.native_layer_norm %arg95, %3, %arg7, %arg6, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_9, %result1_10, %result2_11 = torch.aten.native_layer_norm %arg102, %3, %arg9, %arg8, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_12, %result1_13, %result2_14 = torch.aten.native_layer_norm %arg118, %3, %arg19, %arg18, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_15, %result1_16, %result2_17 = torch.aten.native_layer_norm %arg125, %3, %arg21, %arg20, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_18, %result1_19, %result2_20 = torch.aten.native_layer_norm %arg141, %3, %arg23, %arg22, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_21, %result1_22, %result2_23 = torch.aten.native_layer_norm %arg148, %3, %arg25, %arg24, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_24, %result1_25, %result2_26 = torch.aten.native_layer_norm %arg164, %3, %arg27, %arg26, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_27, %result1_28, %result2_29 = torch.aten.native_layer_norm %arg171, %3, %arg29, %arg28, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_30, %result1_31, %result2_32 = torch.aten.native_layer_norm %arg187, %3, %arg31, %arg30, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_33, %result1_34, %result2_35 = torch.aten.native_layer_norm %arg194, %3, %arg33, %arg32, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_36, %result1_37, %result2_38 = torch.aten.native_layer_norm %arg210, %3, %arg35, %arg34, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_39, %result1_40, %result2_41 = torch.aten.native_layer_norm %arg217, %3, %arg37, %arg36, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_42, %result1_43, %result2_44 = torch.aten.native_layer_norm %arg233, %3, %arg39, %arg38, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_45, %result1_46, %result2_47 = torch.aten.native_layer_norm %arg240, %3, %arg41, %arg40, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_48, %result1_49, %result2_50 = torch.aten.native_layer_norm %arg256, %3, %arg43, %arg42, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_51, %result1_52, %result2_53 = torch.aten.native_layer_norm %arg263, %3, %arg45, %arg44, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_54, %result1_55, %result2_56 = torch.aten.native_layer_norm %arg279, %3, %arg47, %arg46, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_57, %result1_58, %result2_59 = torch.aten.native_layer_norm %arg286, %3, %arg49, %arg48, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_60, %result1_61, %result2_62 = torch.aten.native_layer_norm %arg302, %3, %arg11, %arg10, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_63, %result1_64, %result2_65 = torch.aten.native_layer_norm %arg309, %3, %arg13, %arg12, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_66, %result1_67, %result2_68 = torch.aten.native_layer_norm %arg325, %3, %arg15, %arg14, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_69, %result1_70, %result2_71 = torch.aten.native_layer_norm %arg332, %3, %arg17, %arg16, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    %result0_72, %result1_73, %result2_74 = torch.aten.native_layer_norm %arg336, %3, %arg51, %arg50, %float9.999990e-13 : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.float -> !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,1],f32>
    // ---- NLL loss --------------------------------------------------------
    // Runs nll_loss_forward on %arg340 ([2048,30522]) with targets %arg339,
    // no weight tensor, and the integer operands 1 and -100. Only
    // %total_weight is consumed in the visible code; %output is not.
    // (Operand 1 is presumably reduction=Mean and -100 ignore_index - confirm
    // against the ATen schema.)
    %output, %total_weight = torch.aten.nll_loss_forward %arg340, %arg339, %none, %int1, %int-100 : !torch.vtensor<[2048,30522],f32>, !torch.vtensor<[2048],si64>, !torch.none, !torch.int, !torch.int -> !torch.vtensor<[],f32>, !torch.vtensor<[],f32>
    // %4: gradient w.r.t. %arg340, seeded with the scalar %arg341.
    %4 = torch.aten.nll_loss_backward %arg341, %arg340, %arg339, %none, %int1, %int-100, %total_weight : !torch.vtensor<[],f32>, !torch.vtensor<[2048,30522],f32>, !torch.vtensor<[2048],si64>, !torch.none, !torch.int, !torch.int, !torch.vtensor<[],f32> -> !torch.vtensor<[2048,30522],f32>
    // ---- %13 = %4 - exp(%arg340) * sum(%4, dim=1, keepdim=true) -----------
    // NOTE(review): this is the shape of a decomposed log-softmax backward,
    // which would mean %arg340 holds log-softmax outputs - confirm with the
    // producer of this graph.
    %5 = torch.aten.exp %arg340 : !torch.vtensor<[2048,30522],f32> -> !torch.vtensor<[2048,30522],f32>
    %6 = torch.prim.ListConstruct %int1 : (!torch.int) -> !torch.list<int>
    // The suffixed constants in this section (%true_75, %none_76, %int0_77,
    // %int1_78, ...) duplicate constants already defined at the top of the
    // function. They are left as generated.
    %true_75 = torch.constant.bool true
    %none_76 = torch.constant.none
    %7 = torch.aten.sum.dim_IntList %4, %6, %true_75, %none_76 : !torch.vtensor<[2048,30522],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2048,1],f32>
    // Broadcast the row sums back to the full shape of %4, read dynamically
    // through size.int.
    %int0_77 = torch.constant.int 0
    %8 = torch.aten.size.int %4, %int0_77 : !torch.vtensor<[2048,30522],f32>, !torch.int -> !torch.int
    %int1_78 = torch.constant.int 1
    %9 = torch.aten.size.int %4, %int1_78 : !torch.vtensor<[2048,30522],f32>, !torch.int -> !torch.int
    %10 = torch.prim.ListConstruct %8, %9 : (!torch.int, !torch.int) -> !torch.list<int>
    %11 = torch.aten.broadcast_to %7, %10 : !torch.vtensor<[2048,1],f32>, !torch.list<int> -> !torch.vtensor<[2048,30522],f32>
    %12 = torch.aten.mul.Tensor %5, %11 : !torch.vtensor<[2048,30522],f32>, !torch.vtensor<[2048,30522],f32> -> !torch.vtensor<[2048,30522],f32>
    %float1.000000e00 = torch.constant.float 1.000000e+00
    %13 = torch.aten.sub.Tensor %4, %12, %float1.000000e00 : !torch.vtensor<[2048,30522],f32>, !torch.vtensor<[2048,30522],f32>, !torch.float -> !torch.vtensor<[2048,30522],f32>
    // Reshape to [4,512,30522], add the incoming tensor %arg342, then flatten
    // back to [2048,30522]. %18 feeds both matmuls and the dim-0 sum below.
    %14 = torch.prim.ListConstruct %int4, %int512, %int30522 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %15 = torch.aten.view %13, %14 : !torch.vtensor<[2048,30522],f32>, !torch.list<int> -> !torch.vtensor<[4,512,30522],f32>
    %16 = torch.aten.add.Tensor %arg342, %15, %int1 : !torch.vtensor<[4,512,30522],f32>, !torch.vtensor<[4,512,30522],f32>, !torch.int -> !torch.vtensor<[4,512,30522],f32>
    %17 = torch.prim.ListConstruct %int2048, %int30522 : (!torch.int, !torch.int) -> !torch.list<int>
    %18 = torch.aten.view %16, %17 : !torch.vtensor<[4,512,30522],f32>, !torch.list<int> -> !torch.vtensor<[2048,30522],f32>
    // ---- Matmuls and reduction on %18 -------------------------------------
    // NOTE(review): the three results below (%20, %22/%23, %25/%27) have the
    // shape pattern of a linear layer's input, weight and bias gradients,
    // with %arg337 as the transposed weight and %arg338 as the saved input -
    // confirm.
    // %20 = %18 x transpose(%arg337)  -> [2048,768]
    %int0_79 = torch.constant.int 0
    %int1_80 = torch.constant.int 1
    %19 = torch.aten.transpose.int %arg337, %int0_79, %int1_80 : !torch.vtensor<[768,30522],f32>, !torch.int, !torch.int -> !torch.vtensor<[30522,768],f32>
    %20 = torch.aten.mm %18, %19 : !torch.vtensor<[2048,30522],f32>, !torch.vtensor<[30522,768],f32> -> !torch.vtensor<[2048,768],f32>
    // %22 = transpose(%18) x %arg338  -> [30522,768]
    %int0_81 = torch.constant.int 0
    %int1_82 = torch.constant.int 1
    %21 = torch.aten.transpose.int %18, %int0_81, %int1_82 : !torch.vtensor<[2048,30522],f32>, !torch.int, !torch.int -> !torch.vtensor<[30522,2048],f32>
    %22 = torch.aten.mm %21, %arg338 : !torch.vtensor<[30522,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[30522,768],f32>
    // %23 is %22 transposed to [768,30522]; it is not consumed in the visible code.
    %int0_83 = torch.constant.int 0
    %int1_84 = torch.constant.int 1
    %23 = torch.aten.transpose.int %22, %int0_83, %int1_84 : !torch.vtensor<[30522,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,30522],f32>
    // Sum %18 over dim 0 (keepdim=true), then drop the leading unit dim -> [30522].
    %24 = torch.prim.ListConstruct %int0 : (!torch.int) -> !torch.list<int>
    %25 = torch.aten.sum.dim_IntList %18, %24, %true, %none : !torch.vtensor<[2048,30522],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,30522],f32>
    %26 = torch.prim.ListConstruct %int30522 : (!torch.int) -> !torch.list<int>
    %27 = torch.aten.view %25, %26 : !torch.vtensor<[1,30522],f32>, !torch.list<int> -> !torch.vtensor<[30522],f32>
    // Un-flatten %20 back to [4,512,768].
    %28 = torch.prim.ListConstruct %int4, %int512, %int768 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %29 = torch.aten.view %20, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_85 = torch.constant.int 0
    %int1_86 = torch.constant.int 1
    %30 = torch.aten.transpose.int %23, %int0_85, %int1_86 : !torch.vtensor<[768,30522],f32>, !torch.int, !torch.int -> !torch.vtensor<[30522,768],f32>
    %31 = torch.aten.sub.Tensor %arg336, %result1_73, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %32 = torch.aten.mul.Tensor %31, %result2_74 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %33 = torch.aten.mul.Tensor %29, %arg51 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %34 = torch.aten.mul.Tensor %33, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %35 = torch.prim.ListConstruct %int2 : (!torch.int) -> !torch.list<int>
    %36 = torch.aten.sum.dim_IntList %33, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %37 = torch.aten.mul.Tensor %33, %32 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %38 = torch.aten.sum.dim_IntList %37, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %39 = torch.aten.mul.Tensor %32, %38 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %40 = torch.aten.sub.Tensor %34, %36, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %41 = torch.aten.sub.Tensor %40, %39, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %42 = torch.aten.div.Tensor %result2_74, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %43 = torch.aten.mul.Tensor %42, %41 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %44 = torch.aten.mul.Tensor %29, %32 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %45 = torch.prim.ListConstruct %int0, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
    %46 = torch.aten.sum.dim_IntList %44, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %47 = torch.aten.sum.dim_IntList %29, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %48 = torch.aten.gelu_backward %43, %arg335, %str : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.str -> !torch.vtensor<[4,512,768],f32>
    %49 = torch.prim.ListConstruct %int2048, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
    %50 = torch.aten.view %48, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_87 = torch.constant.int 0
    %int1_88 = torch.constant.int 1
    %51 = torch.aten.transpose.int %arg333, %int0_87, %int1_88 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %52 = torch.aten.mm %50, %51 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_89 = torch.constant.int 0
    %int1_90 = torch.constant.int 1
    %53 = torch.aten.transpose.int %50, %int0_89, %int1_90 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %54 = torch.aten.mm %53, %arg334 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_91 = torch.constant.int 0
    %int1_92 = torch.constant.int 1
    %55 = torch.aten.transpose.int %54, %int0_91, %int1_92 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %56 = torch.aten.sum.dim_IntList %50, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %57 = torch.aten.view %56, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %58 = torch.aten.view %52, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_93 = torch.constant.int 0
    %int1_94 = torch.constant.int 1
    %59 = torch.aten.transpose.int %55, %int0_93, %int1_94 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %60 = torch.aten.sub.Tensor %arg332, %result1_70, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %61 = torch.aten.mul.Tensor %60, %result2_71 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %62 = torch.aten.mul.Tensor %58, %arg17 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %63 = torch.aten.mul.Tensor %62, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %64 = torch.aten.sum.dim_IntList %62, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %65 = torch.aten.mul.Tensor %62, %61 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %66 = torch.aten.sum.dim_IntList %65, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %67 = torch.aten.mul.Tensor %61, %66 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %68 = torch.aten.sub.Tensor %63, %64, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %69 = torch.aten.sub.Tensor %68, %67, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %70 = torch.aten.div.Tensor %result2_71, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %71 = torch.aten.mul.Tensor %70, %69 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %72 = torch.aten.mul.Tensor %58, %61 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %73 = torch.aten.sum.dim_IntList %72, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %74 = torch.aten.sum.dim_IntList %58, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %75 = torch.aten.mul.Tensor %71, %arg331 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %76 = torch.aten.view %75, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_95 = torch.constant.int 0
    %int1_96 = torch.constant.int 1
    %77 = torch.aten.transpose.int %arg329, %int0_95, %int1_96 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %78 = torch.aten.mm %76, %77 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_97 = torch.constant.int 0
    %int1_98 = torch.constant.int 1
    %79 = torch.aten.transpose.int %76, %int0_97, %int1_98 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %80 = torch.aten.mm %79, %arg330 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_99 = torch.constant.int 0
    %int1_100 = torch.constant.int 1
    %81 = torch.aten.transpose.int %80, %int0_99, %int1_100 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %82 = torch.aten.sum.dim_IntList %76, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %83 = torch.aten.view %82, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %84 = torch.prim.ListConstruct %int4, %int512, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %85 = torch.aten.view %78, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_101 = torch.constant.int 0
    %int1_102 = torch.constant.int 1
    %86 = torch.aten.transpose.int %81, %int0_101, %int1_102 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %87 = torch.aten.gelu_backward %85, %arg328, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %88 = torch.prim.ListConstruct %int2048, %int3072 : (!torch.int, !torch.int) -> !torch.list<int>
    %89 = torch.aten.view %87, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_103 = torch.constant.int 0
    %int1_104 = torch.constant.int 1
    %90 = torch.aten.transpose.int %arg326, %int0_103, %int1_104 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %91 = torch.aten.mm %89, %90 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_105 = torch.constant.int 0
    %int1_106 = torch.constant.int 1
    %92 = torch.aten.transpose.int %89, %int0_105, %int1_106 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %93 = torch.aten.mm %92, %arg327 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_107 = torch.constant.int 0
    %int1_108 = torch.constant.int 1
    %94 = torch.aten.transpose.int %93, %int0_107, %int1_108 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %95 = torch.aten.sum.dim_IntList %89, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %96 = torch.prim.ListConstruct %int3072 : (!torch.int) -> !torch.list<int>
    %97 = torch.aten.view %95, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %98 = torch.aten.view %91, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %99 = torch.aten.add.Tensor %71, %98, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_109 = torch.constant.int 0
    %int1_110 = torch.constant.int 1
    %100 = torch.aten.transpose.int %94, %int0_109, %int1_110 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %101 = torch.aten.sub.Tensor %arg325, %result1_67, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %102 = torch.aten.mul.Tensor %101, %result2_68 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %103 = torch.aten.mul.Tensor %99, %arg15 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %104 = torch.aten.mul.Tensor %103, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %105 = torch.aten.sum.dim_IntList %103, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %106 = torch.aten.mul.Tensor %103, %102 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %107 = torch.aten.sum.dim_IntList %106, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %108 = torch.aten.mul.Tensor %102, %107 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %109 = torch.aten.sub.Tensor %104, %105, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %110 = torch.aten.sub.Tensor %109, %108, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %111 = torch.aten.div.Tensor %result2_68, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %112 = torch.aten.mul.Tensor %111, %110 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %113 = torch.aten.mul.Tensor %99, %102 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %114 = torch.aten.sum.dim_IntList %113, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %115 = torch.aten.sum.dim_IntList %99, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %116 = torch.aten.mul.Tensor %112, %arg324 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %117 = torch.aten.view %116, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_111 = torch.constant.int 0
    %int1_112 = torch.constant.int 1
    %118 = torch.aten.transpose.int %arg322, %int0_111, %int1_112 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %119 = torch.aten.mm %117, %118 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_113 = torch.constant.int 0
    %int1_114 = torch.constant.int 1
    %120 = torch.aten.transpose.int %117, %int0_113, %int1_114 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %121 = torch.aten.mm %120, %arg323 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_115 = torch.constant.int 0
    %int1_116 = torch.constant.int 1
    %122 = torch.aten.transpose.int %121, %int0_115, %int1_116 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %123 = torch.aten.sum.dim_IntList %117, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %124 = torch.aten.view %123, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %125 = torch.aten.view %119, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_117 = torch.constant.int 0
    %int1_118 = torch.constant.int 1
    %126 = torch.aten.transpose.int %122, %int0_117, %int1_118 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %127 = torch.prim.ListConstruct %int4, %int512, %int12, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %128 = torch.aten.view %125, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %129 = torch.prim.ListConstruct %int0, %int2, %int1, %int3 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %130 = torch.aten.permute %128, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %131 = torch.aten.clone %130, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %132 = torch.prim.ListConstruct %int48, %int512, %int64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %133 = torch.aten.view %131, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %134 = torch.aten.transpose.int %arg320, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %135 = torch.aten.bmm %134, %133 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %136 = torch.aten.transpose.int %arg321, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %137 = torch.aten.bmm %133, %136 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %138 = torch.prim.ListConstruct %int4, %int12, %int512, %int64 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %139 = torch.aten.view %135, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %140 = torch.prim.ListConstruct %int4, %int12, %int512, %int512 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %141 = torch.aten.view %137, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %142 = torch.aten.mul.Tensor %141, %arg319 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %143 = torch.aten.mul.Tensor %142, %arg318 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %144 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_119 = torch.constant.bool true
    %none_120 = torch.constant.none
    %145 = torch.aten.sum.dim_IntList %143, %144, %true_119, %none_120 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_121 = torch.constant.int 0
    %146 = torch.aten.size.int %143, %int0_121 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_122 = torch.constant.int 1
    %147 = torch.aten.size.int %143, %int1_122 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_123 = torch.constant.int 2
    %148 = torch.aten.size.int %143, %int2_123 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_124 = torch.constant.int 3
    %149 = torch.aten.size.int %143, %int3_124 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %150 = torch.prim.ListConstruct %146, %147, %148, %149 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %151 = torch.aten.broadcast_to %145, %150 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %152 = torch.aten.mul.Tensor %arg318, %151 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_125 = torch.constant.float 1.000000e+00
    %153 = torch.aten.sub.Tensor %143, %152, %float1.000000e00_125 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %154 = torch.aten.div.Tensor %153, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %155 = torch.prim.ListConstruct %int48, %int512, %int512 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %156 = torch.aten.view %154, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    %157 = torch.aten.transpose.int %arg316, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %158 = torch.aten.bmm %157, %156 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %159 = torch.aten.transpose.int %arg317, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %160 = torch.aten.bmm %156, %159 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %161 = torch.prim.ListConstruct %int4, %int12, %int64, %int512 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %162 = torch.aten.view %158, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %163 = torch.aten.view %160, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %164 = torch.aten.transpose.int %162, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %165 = torch.aten.permute %163, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %166 = torch.aten.clone %165, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %167 = torch.aten.view %166, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %168 = torch.aten.permute %139, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %169 = torch.aten.clone %168, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %170 = torch.aten.view %169, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %171 = torch.aten.view %170, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_126 = torch.constant.int 0
    %int1_127 = torch.constant.int 1
    %172 = torch.aten.transpose.int %arg314, %int0_126, %int1_127 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %173 = torch.aten.mm %171, %172 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_128 = torch.constant.int 0
    %int1_129 = torch.constant.int 1
    %174 = torch.aten.transpose.int %171, %int0_128, %int1_129 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %175 = torch.aten.mm %174, %arg315 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_130 = torch.constant.int 0
    %int1_131 = torch.constant.int 1
    %176 = torch.aten.transpose.int %175, %int0_130, %int1_131 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %177 = torch.aten.sum.dim_IntList %171, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %178 = torch.aten.view %177, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %179 = torch.aten.view %173, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %180 = torch.aten.add.Tensor %112, %179, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_132 = torch.constant.int 0
    %int1_133 = torch.constant.int 1
    %181 = torch.aten.transpose.int %176, %int0_132, %int1_133 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %182 = torch.aten.permute %164, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %183 = torch.aten.view %182, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %184 = torch.aten.clone %183, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %185 = torch.aten.view %184, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_134 = torch.constant.int 0
    %int1_135 = torch.constant.int 1
    %186 = torch.aten.transpose.int %arg312, %int0_134, %int1_135 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %187 = torch.aten.mm %185, %186 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_136 = torch.constant.int 0
    %int1_137 = torch.constant.int 1
    %188 = torch.aten.transpose.int %185, %int0_136, %int1_137 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %189 = torch.aten.mm %188, %arg313 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_138 = torch.constant.int 0
    %int1_139 = torch.constant.int 1
    %190 = torch.aten.transpose.int %189, %int0_138, %int1_139 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %191 = torch.aten.sum.dim_IntList %185, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %192 = torch.aten.view %191, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %193 = torch.aten.view %187, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %194 = torch.aten.add.Tensor %180, %193, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_140 = torch.constant.int 0
    %int1_141 = torch.constant.int 1
    %195 = torch.aten.transpose.int %190, %int0_140, %int1_141 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %196 = torch.aten.view %167, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_142 = torch.constant.int 0
    %int1_143 = torch.constant.int 1
    %197 = torch.aten.transpose.int %arg310, %int0_142, %int1_143 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %198 = torch.aten.mm %196, %197 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_144 = torch.constant.int 0
    %int1_145 = torch.constant.int 1
    %199 = torch.aten.transpose.int %196, %int0_144, %int1_145 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %200 = torch.aten.mm %199, %arg311 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_146 = torch.constant.int 0
    %int1_147 = torch.constant.int 1
    %201 = torch.aten.transpose.int %200, %int0_146, %int1_147 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %202 = torch.aten.sum.dim_IntList %196, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %203 = torch.aten.view %202, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %204 = torch.aten.view %198, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %205 = torch.aten.add.Tensor %194, %204, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_148 = torch.constant.int 0
    %int1_149 = torch.constant.int 1
    %206 = torch.aten.transpose.int %201, %int0_148, %int1_149 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %207 = torch.aten.sub.Tensor %arg309, %result1_64, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %208 = torch.aten.mul.Tensor %207, %result2_65 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %209 = torch.aten.mul.Tensor %205, %arg13 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %210 = torch.aten.mul.Tensor %209, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %211 = torch.aten.sum.dim_IntList %209, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %212 = torch.aten.mul.Tensor %209, %208 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %213 = torch.aten.sum.dim_IntList %212, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %214 = torch.aten.mul.Tensor %208, %213 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %215 = torch.aten.sub.Tensor %210, %211, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %216 = torch.aten.sub.Tensor %215, %214, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %217 = torch.aten.div.Tensor %result2_65, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %218 = torch.aten.mul.Tensor %217, %216 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %219 = torch.aten.mul.Tensor %205, %208 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %220 = torch.aten.sum.dim_IntList %219, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %221 = torch.aten.sum.dim_IntList %205, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %218 is multiplied elementwise by %arg308 (same shape; possibly a saved dropout mask — verify
    // against the forward graph) and flattened from [4,512,768] to [2048,768] using shape list %49.
    %222 = torch.aten.mul.Tensor %218, %arg308 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %223 = torch.aten.view %222, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three results derived from %223 (the form matches the backward of a linear layer — assumption):
    //   %225 = %223 x transpose(%arg306)       -> [2048,3072]
    //   %227 = transpose(%223) x %arg307       -> [768,3072], transposed into %228 [3072,768]
    //   %230 = sum of %223 over dim list %24   -> [1,768], viewed as [768] via %3
    %int0_150 = torch.constant.int 0
    %int1_151 = torch.constant.int 1
    %224 = torch.aten.transpose.int %arg306, %int0_150, %int1_151 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %225 = torch.aten.mm %223, %224 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_152 = torch.constant.int 0
    %int1_153 = torch.constant.int 1
    %226 = torch.aten.transpose.int %223, %int0_152, %int1_153 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %227 = torch.aten.mm %226, %arg307 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_154 = torch.constant.int 0
    %int1_155 = torch.constant.int 1
    %228 = torch.aten.transpose.int %227, %int0_154, %int1_155 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %229 = torch.aten.sum.dim_IntList %223, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %230 = torch.aten.view %229, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %225 is reshaped back to a 3-D [4,512,3072] tensor via shape list %84.
    %231 = torch.aten.view %225, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    // %232 transposes %228 again, giving the same [768,3072] shape as %227.
    %int0_156 = torch.constant.int 0
    %int1_157 = torch.constant.int 1
    %232 = torch.aten.transpose.int %228, %int0_156, %int1_157 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // GELU backward applied to %231 with %arg305 as the second operand and %str as the mode string
    // (%str is defined earlier in the function). The result is flattened to [2048,3072] via %88.
    %233 = torch.aten.gelu_backward %231, %arg305, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %234 = torch.aten.view %233, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    // Three results derived from %234 (the form matches the backward of a linear layer — assumption):
    //   %236 = %234 x transpose(%arg303)       -> [2048,768]
    //   %238 = transpose(%234) x %arg304       -> [3072,768], transposed into %239 [768,3072]
    //   %241 = sum of %234 over dim list %24   -> [1,3072], viewed as [3072] via %96
    %int0_158 = torch.constant.int 0
    %int1_159 = torch.constant.int 1
    %235 = torch.aten.transpose.int %arg303, %int0_158, %int1_159 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %236 = torch.aten.mm %234, %235 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_160 = torch.constant.int 0
    %int1_161 = torch.constant.int 1
    %237 = torch.aten.transpose.int %234, %int0_160, %int1_161 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %238 = torch.aten.mm %237, %arg304 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_162 = torch.constant.int 0
    %int1_163 = torch.constant.int 1
    %239 = torch.aten.transpose.int %238, %int0_162, %int1_163 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %240 = torch.aten.sum.dim_IntList %234, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %241 = torch.aten.view %240, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    // %236 is reshaped to [4,512,768] and added to %218 (alpha operand %int1).
    // Presumably this merges the gradient through the feed-forward branch with the skip path — TODO confirm.
    %242 = torch.aten.view %236, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %243 = torch.aten.add.Tensor %218, %242, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    // %244 transposes %239 again, giving the same [3072,768] shape as %238.
    %int0_164 = torch.constant.int 0
    %int1_165 = torch.constant.int 1
    %244 = torch.aten.transpose.int %239, %int0_164, %int1_165 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // %246 = (%arg302 - %result1_61) * %result2_62, with the [4,512,1] operands broadcast over the last dim.
    // NOTE(review): looks like a re-normalized activation (input minus mean, times reciprocal std) — confirm
    // what %result1_61 / %result2_62 hold where they are defined.
    %245 = torch.aten.sub.Tensor %arg302, %result1_61, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %246 = torch.aten.mul.Tensor %245, %result2_62 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // With %247 = %243 * %arg11 ([768] broadcast), the visible computation is:
    //   %254 = %247 * %0  -  sum(%247, dims %35)  -  %246 * sum(%247 * %246, dims %35)
    //   %256 = (%result2_62 / %0) * %254
    // %0 is a rank-0 si64 tensor defined earlier in the function.
    %247 = torch.aten.mul.Tensor %243, %arg11 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %248 = torch.aten.mul.Tensor %247, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %249 = torch.aten.sum.dim_IntList %247, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %250 = torch.aten.mul.Tensor %247, %246 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %251 = torch.aten.sum.dim_IntList %250, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %252 = torch.aten.mul.Tensor %246, %251 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %253 = torch.aten.sub.Tensor %248, %249, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %254 = torch.aten.sub.Tensor %253, %252, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %255 = torch.aten.div.Tensor %result2_62, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %256 = torch.aten.mul.Tensor %255, %254 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over dim list %45 down to [768]: %258 from %243 * %246, %259 from %243 alone.
    // Presumably the per-feature scale and bias gradients — TODO confirm.
    %257 = torch.aten.mul.Tensor %243, %246 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %258 = torch.aten.sum.dim_IntList %257, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %259 = torch.aten.sum.dim_IntList %243, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %256 is multiplied elementwise by %arg301 (same shape; possibly a saved dropout mask — verify)
    // and flattened from [4,512,768] to [2048,768] using shape list %49.
    %260 = torch.aten.mul.Tensor %256, %arg301 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %261 = torch.aten.view %260, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three results derived from %261 (the form matches the backward of a linear layer — assumption):
    //   %263 = %261 x transpose(%arg299)       -> [2048,768]
    //   %265 = transpose(%261) x %arg300       -> [768,768], transposed into %266
    //   %268 = sum of %261 over dim list %24   -> [1,768], viewed as [768] via %3
    %int0_166 = torch.constant.int 0
    %int1_167 = torch.constant.int 1
    %262 = torch.aten.transpose.int %arg299, %int0_166, %int1_167 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %263 = torch.aten.mm %261, %262 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_168 = torch.constant.int 0
    %int1_169 = torch.constant.int 1
    %264 = torch.aten.transpose.int %261, %int0_168, %int1_169 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %265 = torch.aten.mm %264, %arg300 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_170 = torch.constant.int 0
    %int1_171 = torch.constant.int 1
    %266 = torch.aten.transpose.int %265, %int0_170, %int1_171 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %267 = torch.aten.sum.dim_IntList %261, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %268 = torch.aten.view %267, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %263 is reshaped back to [4,512,768] via shape list %28.
    %269 = torch.aten.view %263, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // %270 transposes %266 once more (dims 0 and 1).
    %int0_172 = torch.constant.int 0
    %int1_173 = torch.constant.int 1
    %270 = torch.aten.transpose.int %266, %int0_172, %int1_173 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Reshape %269 from [4,512,768] to [4,512,12,64], permute to [4,12,512,64], clone, and flatten the
    // two leading dims to get %274 with shape [48,512,64]. Shape/permutation lists %127, %129, %132 are
    // defined earlier in the function. Presumably 12 is the attention head count — TODO confirm.
    %271 = torch.aten.view %269, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %272 = torch.aten.permute %271, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %273 = torch.aten.clone %272, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %274 = torch.aten.view %273, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    // Two batched products involving %274:
    //   %276 = transpose(%arg297) x %274   -> [48,512,64]
    //   %278 = %274 x transpose(%arg298)   -> [48,512,512]
    // NOTE(review): consistent with the backward of a batched (probabilities x values) matmul — confirm.
    %275 = torch.aten.transpose.int %arg297, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %276 = torch.aten.bmm %275, %274 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %277 = torch.aten.transpose.int %arg298, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %278 = torch.aten.bmm %274, %277 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %279 = torch.aten.view %276, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %280 = torch.aten.view %278, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    // %282 = %280 * %arg296 * %arg295 (all [4,12,512,512]).
    // Possibly %arg296 is a dropout mask and %arg295 a saved softmax output — verify against the forward graph.
    %281 = torch.aten.mul.Tensor %280, %arg296 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %282 = torch.aten.mul.Tensor %281, %arg295 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // %284 reduces %282 to [4,12,512,1] with keepdim=true, over the single dim held in %int-1
    // (defined earlier in the function).
    %283 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_174 = torch.constant.bool true
    %none_175 = torch.constant.none
    %284 = torch.aten.sum.dim_IntList %282, %283, %true_174, %none_175 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    // The four sizes of %282 are queried and used to broadcast %284 back to [4,12,512,512].
    %int0_176 = torch.constant.int 0
    %285 = torch.aten.size.int %282, %int0_176 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_177 = torch.constant.int 1
    %286 = torch.aten.size.int %282, %int1_177 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_178 = torch.constant.int 2
    %287 = torch.aten.size.int %282, %int2_178 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_179 = torch.constant.int 3
    %288 = torch.aten.size.int %282, %int3_179 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %289 = torch.prim.ListConstruct %285, %286, %287, %288 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %290 = torch.aten.broadcast_to %284, %289 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    // %292 = %282 - 1.0 * (%arg295 * %290); %293 divides that by the rank-0 f64 tensor %1.
    // NOTE(review): this is the g*y - y*sum(g*y) form of a softmax backward if %arg295 is the softmax
    // output; %1 is presumably the attention score scale — confirm both.
    %291 = torch.aten.mul.Tensor %arg295, %290 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_180 = torch.constant.float 1.000000e+00
    %292 = torch.aten.sub.Tensor %282, %291, %float1.000000e00_180 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %293 = torch.aten.div.Tensor %292, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %294 = torch.aten.view %293, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // Two batched products involving %294:
    //   %296 = transpose(%arg293) x %294   -> [48,64,512]
    //   %298 = %294 x transpose(%arg294)   -> [48,512,64]
    %295 = torch.aten.transpose.int %arg293, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %296 = torch.aten.bmm %295, %294 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %297 = torch.aten.transpose.int %arg294, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %298 = torch.aten.bmm %294, %297 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %299 = torch.aten.view %296, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %300 = torch.aten.view %298, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    // %301 swaps the last two dims of %299, giving [4,12,512,64].
    %301 = torch.aten.transpose.int %299, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    // %300 and %279 are each permuted to [4,512,12,64], cloned, and viewed as [4,512,768] (%304, %307).
    %302 = torch.aten.permute %300, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %303 = torch.aten.clone %302, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %304 = torch.aten.view %303, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %305 = torch.aten.permute %279, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %306 = torch.aten.clone %305, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %307 = torch.aten.view %306, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // %307 is flattened to [2048,768]; three results are derived from it (the form matches the backward
    // of a linear layer — assumption; which projection this belongs to is not visible here):
    //   %310 = %308 x transpose(%arg291)       -> [2048,768]
    //   %312 = transpose(%308) x %arg292       -> [768,768], transposed into %313
    //   %315 = sum of %308 over dim list %24   -> [1,768], viewed as [768] via %3
    %308 = torch.aten.view %307, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_181 = torch.constant.int 0
    %int1_182 = torch.constant.int 1
    %309 = torch.aten.transpose.int %arg291, %int0_181, %int1_182 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %310 = torch.aten.mm %308, %309 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_183 = torch.constant.int 0
    %int1_184 = torch.constant.int 1
    %311 = torch.aten.transpose.int %308, %int0_183, %int1_184 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %312 = torch.aten.mm %311, %arg292 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_185 = torch.constant.int 0
    %int1_186 = torch.constant.int 1
    %313 = torch.aten.transpose.int %312, %int0_185, %int1_186 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %314 = torch.aten.sum.dim_IntList %308, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %315 = torch.aten.view %314, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %310 is reshaped to [4,512,768] and added to %256 (alpha operand %int1), starting a running sum (%317).
    %316 = torch.aten.view %310, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %317 = torch.aten.add.Tensor %256, %316, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    // %318 transposes %313 once more (dims 0 and 1).
    %int0_187 = torch.constant.int 0
    %int1_188 = torch.constant.int 1
    %318 = torch.aten.transpose.int %313, %int0_187, %int1_188 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // %301 is permuted to [4,512,12,64], viewed as [4,512,768], cloned, and flattened to [2048,768].
    // NOTE(review): the view precedes the clone here, whereas the neighbouring head-merge sequences in
    // this function clone before viewing; the result types are the same either way.
    %319 = torch.aten.permute %301, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %320 = torch.aten.view %319, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %321 = torch.aten.clone %320, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %322 = torch.aten.view %321, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three results derived from %322 (the form matches the backward of a linear layer — assumption):
    //   %324 = %322 x transpose(%arg289)       -> [2048,768]
    //   %326 = transpose(%322) x %arg290       -> [768,768], transposed into %327
    //   %329 = sum of %322 over dim list %24   -> [1,768], viewed as [768] via %3
    %int0_189 = torch.constant.int 0
    %int1_190 = torch.constant.int 1
    %323 = torch.aten.transpose.int %arg289, %int0_189, %int1_190 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %324 = torch.aten.mm %322, %323 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_191 = torch.constant.int 0
    %int1_192 = torch.constant.int 1
    %325 = torch.aten.transpose.int %322, %int0_191, %int1_192 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %326 = torch.aten.mm %325, %arg290 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_193 = torch.constant.int 0
    %int1_194 = torch.constant.int 1
    %327 = torch.aten.transpose.int %326, %int0_193, %int1_194 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %328 = torch.aten.sum.dim_IntList %322, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %329 = torch.aten.view %328, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %324 is reshaped to [4,512,768] and added to the running sum %317, giving %331.
    %330 = torch.aten.view %324, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %331 = torch.aten.add.Tensor %317, %330, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    // %332 transposes %327 once more (dims 0 and 1).
    %int0_195 = torch.constant.int 0
    %int1_196 = torch.constant.int 1
    %332 = torch.aten.transpose.int %327, %int0_195, %int1_196 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // %304 is flattened to [2048,768]; three results are derived from it (the form matches the backward
    // of a linear layer — assumption):
    //   %335 = %333 x transpose(%arg287)       -> [2048,768]
    //   %337 = transpose(%333) x %arg288       -> [768,768], transposed into %338
    //   %340 = sum of %333 over dim list %24   -> [1,768], viewed as [768] via %3
    %333 = torch.aten.view %304, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_197 = torch.constant.int 0
    %int1_198 = torch.constant.int 1
    %334 = torch.aten.transpose.int %arg287, %int0_197, %int1_198 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %335 = torch.aten.mm %333, %334 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_199 = torch.constant.int 0
    %int1_200 = torch.constant.int 1
    %336 = torch.aten.transpose.int %333, %int0_199, %int1_200 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %337 = torch.aten.mm %336, %arg288 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_201 = torch.constant.int 0
    %int1_202 = torch.constant.int 1
    %338 = torch.aten.transpose.int %337, %int0_201, %int1_202 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %339 = torch.aten.sum.dim_IntList %333, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %340 = torch.aten.view %339, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %335 is reshaped to [4,512,768] and added to the running sum %331, giving %342.
    %341 = torch.aten.view %335, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %342 = torch.aten.add.Tensor %331, %341, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    // %343 transposes %338 once more (dims 0 and 1).
    %int0_203 = torch.constant.int 0
    %int1_204 = torch.constant.int 1
    %343 = torch.aten.transpose.int %338, %int0_203, %int1_204 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // %345 = (%arg286 - %result1_58) * %result2_59, with the [4,512,1] operands broadcast over the last dim.
    // NOTE(review): looks like a re-normalized activation (input minus mean, times reciprocal std) — confirm
    // what %result1_58 / %result2_59 hold where they are defined.
    %344 = torch.aten.sub.Tensor %arg286, %result1_58, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %345 = torch.aten.mul.Tensor %344, %result2_59 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // With %346 = %342 * %arg49 ([768] broadcast), the visible computation is:
    //   %353 = %346 * %0  -  sum(%346, dims %35)  -  %345 * sum(%346 * %345, dims %35)
    //   %355 = (%result2_59 / %0) * %353
    // %0 is a rank-0 si64 tensor defined earlier in the function.
    %346 = torch.aten.mul.Tensor %342, %arg49 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %347 = torch.aten.mul.Tensor %346, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %348 = torch.aten.sum.dim_IntList %346, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %349 = torch.aten.mul.Tensor %346, %345 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %350 = torch.aten.sum.dim_IntList %349, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %351 = torch.aten.mul.Tensor %345, %350 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %352 = torch.aten.sub.Tensor %347, %348, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %353 = torch.aten.sub.Tensor %352, %351, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %354 = torch.aten.div.Tensor %result2_59, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %355 = torch.aten.mul.Tensor %354, %353 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over dim list %45 down to [768]: %357 from %342 * %345, %358 from %342 alone.
    // Presumably the per-feature scale and bias gradients — TODO confirm.
    %356 = torch.aten.mul.Tensor %342, %345 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %357 = torch.aten.sum.dim_IntList %356, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %358 = torch.aten.sum.dim_IntList %342, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %355 is multiplied elementwise by %arg285 (same shape; possibly a saved dropout mask — verify)
    // and flattened from [4,512,768] to [2048,768] using shape list %49.
    %359 = torch.aten.mul.Tensor %355, %arg285 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %360 = torch.aten.view %359, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three results derived from %360 (the form matches the backward of a linear layer — assumption):
    //   %362 = %360 x transpose(%arg283)       -> [2048,3072]
    //   %364 = transpose(%360) x %arg284       -> [768,3072], transposed into %365 [3072,768]
    //   %367 = sum of %360 over dim list %24   -> [1,768], viewed as [768] via %3
    %int0_205 = torch.constant.int 0
    %int1_206 = torch.constant.int 1
    %361 = torch.aten.transpose.int %arg283, %int0_205, %int1_206 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %362 = torch.aten.mm %360, %361 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_207 = torch.constant.int 0
    %int1_208 = torch.constant.int 1
    %363 = torch.aten.transpose.int %360, %int0_207, %int1_208 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %364 = torch.aten.mm %363, %arg284 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_209 = torch.constant.int 0
    %int1_210 = torch.constant.int 1
    %365 = torch.aten.transpose.int %364, %int0_209, %int1_210 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %366 = torch.aten.sum.dim_IntList %360, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %367 = torch.aten.view %366, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %362 is reshaped back to a 3-D [4,512,3072] tensor via shape list %84.
    %368 = torch.aten.view %362, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    // %369 transposes %365 again, giving the same [768,3072] shape as %364.
    %int0_211 = torch.constant.int 0
    %int1_212 = torch.constant.int 1
    %369 = torch.aten.transpose.int %365, %int0_211, %int1_212 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // GELU backward applied to %368 with %arg282 as the second operand and %str as the mode string
    // (%str is defined earlier in the function). The result is flattened to [2048,3072] via %88.
    %370 = torch.aten.gelu_backward %368, %arg282, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %371 = torch.aten.view %370, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    // Three results derived from %371 (the form matches the backward of a linear layer — assumption):
    //   %373 = %371 x transpose(%arg280)       -> [2048,768]
    //   %375 = transpose(%371) x %arg281       -> [3072,768], transposed into %376 [768,3072]
    //   %378 = sum of %371 over dim list %24   -> [1,3072], viewed as [3072] via %96
    %int0_213 = torch.constant.int 0
    %int1_214 = torch.constant.int 1
    %372 = torch.aten.transpose.int %arg280, %int0_213, %int1_214 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %373 = torch.aten.mm %371, %372 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_215 = torch.constant.int 0
    %int1_216 = torch.constant.int 1
    %374 = torch.aten.transpose.int %371, %int0_215, %int1_216 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %375 = torch.aten.mm %374, %arg281 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_217 = torch.constant.int 0
    %int1_218 = torch.constant.int 1
    %376 = torch.aten.transpose.int %375, %int0_217, %int1_218 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %377 = torch.aten.sum.dim_IntList %371, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %378 = torch.aten.view %377, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    // %373 is reshaped to [4,512,768] and added to %355 (alpha operand %int1).
    // Presumably this merges the gradient through the feed-forward branch with the skip path — TODO confirm.
    %379 = torch.aten.view %373, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %380 = torch.aten.add.Tensor %355, %379, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    // %381 transposes %376 again, giving the same [3072,768] shape as %375.
    %int0_219 = torch.constant.int 0
    %int1_220 = torch.constant.int 1
    %381 = torch.aten.transpose.int %376, %int0_219, %int1_220 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // %383 = (%arg279 - %result1_55) * %result2_56, with the [4,512,1] operands broadcast over the last dim.
    // NOTE(review): looks like a re-normalized activation (input minus mean, times reciprocal std) — confirm
    // what %result1_55 / %result2_56 hold where they are defined.
    %382 = torch.aten.sub.Tensor %arg279, %result1_55, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %383 = torch.aten.mul.Tensor %382, %result2_56 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // With %384 = %380 * %arg47 ([768] broadcast), the visible computation is:
    //   %391 = %384 * %0  -  sum(%384, dims %35)  -  %383 * sum(%384 * %383, dims %35)
    //   %393 = (%result2_56 / %0) * %391
    // %0 is a rank-0 si64 tensor defined earlier in the function.
    %384 = torch.aten.mul.Tensor %380, %arg47 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %385 = torch.aten.mul.Tensor %384, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %386 = torch.aten.sum.dim_IntList %384, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %387 = torch.aten.mul.Tensor %384, %383 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %388 = torch.aten.sum.dim_IntList %387, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %389 = torch.aten.mul.Tensor %383, %388 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %390 = torch.aten.sub.Tensor %385, %386, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %391 = torch.aten.sub.Tensor %390, %389, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %392 = torch.aten.div.Tensor %result2_56, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %393 = torch.aten.mul.Tensor %392, %391 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over dim list %45 down to [768]: %395 from %380 * %383, %396 from %380 alone.
    // Presumably the per-feature scale and bias gradients — TODO confirm.
    %394 = torch.aten.mul.Tensor %380, %383 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %395 = torch.aten.sum.dim_IntList %394, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %396 = torch.aten.sum.dim_IntList %380, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %393 is multiplied elementwise by %arg278 (same shape; possibly a saved dropout mask — verify)
    // and flattened from [4,512,768] to [2048,768] using shape list %49.
    %397 = torch.aten.mul.Tensor %393, %arg278 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %398 = torch.aten.view %397, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three results derived from %398 (the form matches the backward of a linear layer — assumption):
    //   %400 = %398 x transpose(%arg276)       -> [2048,768]
    //   %402 = transpose(%398) x %arg277       -> [768,768], transposed into %403
    //   %405 = sum of %398 over dim list %24   -> [1,768], viewed as [768] via %3
    %int0_221 = torch.constant.int 0
    %int1_222 = torch.constant.int 1
    %399 = torch.aten.transpose.int %arg276, %int0_221, %int1_222 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %400 = torch.aten.mm %398, %399 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_223 = torch.constant.int 0
    %int1_224 = torch.constant.int 1
    %401 = torch.aten.transpose.int %398, %int0_223, %int1_224 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %402 = torch.aten.mm %401, %arg277 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_225 = torch.constant.int 0
    %int1_226 = torch.constant.int 1
    %403 = torch.aten.transpose.int %402, %int0_225, %int1_226 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %404 = torch.aten.sum.dim_IntList %398, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %405 = torch.aten.view %404, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // %400 is reshaped back to [4,512,768] via shape list %28.
    %406 = torch.aten.view %400, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // %407 transposes %403 once more (dims 0 and 1).
    %int0_227 = torch.constant.int 0
    %int1_228 = torch.constant.int 1
    %407 = torch.aten.transpose.int %403, %int0_227, %int1_228 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %408 = torch.aten.view %406, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %409 = torch.aten.permute %408, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %410 = torch.aten.clone %409, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %411 = torch.aten.view %410, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %412 = torch.aten.transpose.int %arg274, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %413 = torch.aten.bmm %412, %411 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %414 = torch.aten.transpose.int %arg275, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %415 = torch.aten.bmm %411, %414 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %416 = torch.aten.view %413, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %417 = torch.aten.view %415, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %418 = torch.aten.mul.Tensor %417, %arg273 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %419 = torch.aten.mul.Tensor %418, %arg272 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // Computes %429 = %419 - %arg272 * broadcast(sum(%419, dim=-1, keepdim=true)):
    //   * %421 reduces %419 over its last dimension, keeping it as size 1.
    //   * %422..%426 read %419's own four sizes to build the broadcast target.
    //   * %427 expands the reduction back to [4,12,512,512].
    // The result is divided by the 0-d f64 tensor %1 and flattened to
    // [48,512,512] (%431).
    // NOTE(review): since %419 already contains a factor of %arg272, this has the
    // form of a softmax backward followed by undoing the attention-score scaling;
    // %1 and %155 are defined before this section -- confirm their values.
    %420 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_229 = torch.constant.bool true
    %none_230 = torch.constant.none
    %421 = torch.aten.sum.dim_IntList %419, %420, %true_229, %none_230 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_231 = torch.constant.int 0
    %422 = torch.aten.size.int %419, %int0_231 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_232 = torch.constant.int 1
    %423 = torch.aten.size.int %419, %int1_232 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_233 = torch.constant.int 2
    %424 = torch.aten.size.int %419, %int2_233 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_234 = torch.constant.int 3
    %425 = torch.aten.size.int %419, %int3_234 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %426 = torch.prim.ListConstruct %422, %423, %424, %425 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %427 = torch.aten.broadcast_to %421, %426 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %428 = torch.aten.mul.Tensor %arg272, %427 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_235 = torch.constant.float 1.000000e+00
    %429 = torch.aten.sub.Tensor %419, %428, %float1.000000e00_235 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %430 = torch.aten.div.Tensor %429, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %431 = torch.aten.view %430, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // Two batched products against %431 ([48,512,512]):
    //   %433 = transpose(%arg270, dims 1/2) x %431   -> [48,64,512]
    //   %435 = %431 x transpose(%arg271, dims 1/2)   -> [48,512,64]
    // Each is viewed back to 4-D (%436, %437); %436 additionally has its last two
    // dimensions swapped so that it is [4,12,512,64] (%438).
    // %437 and the earlier %416 are then permuted with %129 to [4,512,12,64],
    // cloned, and viewed as [4,512,768] (%441 and %444 respectively).
    // NOTE(review): presumably the gradients flowing to the query/key
    // projections (via %arg270/%arg271) and the value projection (%416) --
    // confirm. %129, %138, %161 and %28 are defined before this section.
    %432 = torch.aten.transpose.int %arg270, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %433 = torch.aten.bmm %432, %431 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %434 = torch.aten.transpose.int %arg271, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %435 = torch.aten.bmm %431, %434 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %436 = torch.aten.view %433, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %437 = torch.aten.view %435, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %438 = torch.aten.transpose.int %436, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %439 = torch.aten.permute %437, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %440 = torch.aten.clone %439, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %441 = torch.aten.view %440, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %442 = torch.aten.permute %416, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %443 = torch.aten.clone %442, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %444 = torch.aten.view %443, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // Flattens %444 to [2048,768] (%445) and derives three results from it:
    //   %447 = %445 x transpose(%arg268)   -> [2048,768]; viewed as [4,512,768]
    //          (%453) and added to %393 to give the running sum %454.
    //   %449 = transpose(%445) x %arg269   -> [768,768]; transposed (%450) and
    //          transposed again (%455).
    //   %451 = sum of %445 over the dims in %24 (result keeps a size-1 dim),
    //          viewed as [768] (%452).
    // NOTE(review): this is the shape of a linear-layer backward (input grad,
    // weight grad, bias grad) with %arg268 the weight and %arg269 the saved
    // layer input -- confirm. %393, %24, %3, %28, %49, %true and %none are
    // defined before this section.
    %445 = torch.aten.view %444, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_236 = torch.constant.int 0
    %int1_237 = torch.constant.int 1
    %446 = torch.aten.transpose.int %arg268, %int0_236, %int1_237 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %447 = torch.aten.mm %445, %446 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_238 = torch.constant.int 0
    %int1_239 = torch.constant.int 1
    %448 = torch.aten.transpose.int %445, %int0_238, %int1_239 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %449 = torch.aten.mm %448, %arg269 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_240 = torch.constant.int 0
    %int1_241 = torch.constant.int 1
    %450 = torch.aten.transpose.int %449, %int0_240, %int1_241 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %451 = torch.aten.sum.dim_IntList %445, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %452 = torch.aten.view %451, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %453 = torch.aten.view %447, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %454 = torch.aten.add.Tensor %393, %453, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_242 = torch.constant.int 0
    %int1_243 = torch.constant.int 1
    %455 = torch.aten.transpose.int %450, %int0_242, %int1_243 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Merges the heads of %438: permute with %129 to [4,512,12,64], view as
    // [4,512,768], clone, then flatten to [2048,768] (%459). Note that here the
    // view precedes the clone, unlike the %440/%443 chains above.
    // From %459 the same three-result pattern follows:
    //   %461 = %459 x transpose(%arg266)   -> [2048,768]; viewed as [4,512,768]
    //          (%467) and added to %454 to give %468.
    //   %463 = transpose(%459) x %arg267   -> [768,768]; transposed twice
    //          (%464, %469).
    //   %465 = sum of %459 over the dims in %24, viewed as [768] (%466).
    // NOTE(review): presumably a second linear-layer backward feeding the same
    // accumulated input gradient -- confirm which projection %arg266 belongs to.
    %456 = torch.aten.permute %438, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %457 = torch.aten.view %456, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %458 = torch.aten.clone %457, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %459 = torch.aten.view %458, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_244 = torch.constant.int 0
    %int1_245 = torch.constant.int 1
    %460 = torch.aten.transpose.int %arg266, %int0_244, %int1_245 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %461 = torch.aten.mm %459, %460 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_246 = torch.constant.int 0
    %int1_247 = torch.constant.int 1
    %462 = torch.aten.transpose.int %459, %int0_246, %int1_247 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %463 = torch.aten.mm %462, %arg267 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_248 = torch.constant.int 0
    %int1_249 = torch.constant.int 1
    %464 = torch.aten.transpose.int %463, %int0_248, %int1_249 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %465 = torch.aten.sum.dim_IntList %459, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %466 = torch.aten.view %465, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %467 = torch.aten.view %461, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %468 = torch.aten.add.Tensor %454, %467, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_250 = torch.constant.int 0
    %int1_251 = torch.constant.int 1
    %469 = torch.aten.transpose.int %464, %int0_250, %int1_251 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Flattens %441 to [2048,768] (%470) and repeats the three-result pattern:
    //   %472 = %470 x transpose(%arg264)   -> [2048,768]; viewed as [4,512,768]
    //          (%478) and added to %468 to give %479.
    //   %474 = transpose(%470) x %arg265   -> [768,768]; transposed twice
    //          (%475, %480).
    //   %476 = sum of %470 over the dims in %24, viewed as [768] (%477).
    // NOTE(review): presumably the third of the Q/K/V linear-layer backwards;
    // %479 would then be the total gradient w.r.t. the attention block input --
    // confirm.
    %470 = torch.aten.view %441, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_252 = torch.constant.int 0
    %int1_253 = torch.constant.int 1
    %471 = torch.aten.transpose.int %arg264, %int0_252, %int1_253 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %472 = torch.aten.mm %470, %471 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_254 = torch.constant.int 0
    %int1_255 = torch.constant.int 1
    %473 = torch.aten.transpose.int %470, %int0_254, %int1_255 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %474 = torch.aten.mm %473, %arg265 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_256 = torch.constant.int 0
    %int1_257 = torch.constant.int 1
    %475 = torch.aten.transpose.int %474, %int0_256, %int1_257 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %476 = torch.aten.sum.dim_IntList %470, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %477 = torch.aten.view %476, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %478 = torch.aten.view %472, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %479 = torch.aten.add.Tensor %468, %478, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_258 = torch.constant.int 0
    %int1_259 = torch.constant.int 1
    %480 = torch.aten.transpose.int %475, %int0_258, %int1_259 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Elementwise arithmetic on %479 using %arg263 and the per-row [4,512,1]
    // tensors %result1_52 / %result2_53:
    //   %482 = (%arg263 - %result1_52) * %result2_53
    //   %483 = %479 * %arg45                         (%arg45 is [768])
    //   %492 = (%result2_53 / %0)
    //          * (%483 * %0 - sum(%483, %35) - %482 * sum(%483 * %482, %35))
    //   %494 = sum(%479 * %482, %45)  -> [768]
    //   %495 = sum(%479, %45)         -> [768]
    // The %35 reductions keep a trailing size-1 dim; the %45 reductions do not.
    // NOTE(review): this matches the usual layer-norm backward formula, with
    // %result1_52 = mean, %result2_53 = rstd, %arg45 = weight, %0 = normalized
    // size, %494/%495 = weight/bias grads -- confirm. %0, %35, %45, %false are
    // defined before this section.
    %481 = torch.aten.sub.Tensor %arg263, %result1_52, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %482 = torch.aten.mul.Tensor %481, %result2_53 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %483 = torch.aten.mul.Tensor %479, %arg45 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %484 = torch.aten.mul.Tensor %483, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %485 = torch.aten.sum.dim_IntList %483, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %486 = torch.aten.mul.Tensor %483, %482 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %487 = torch.aten.sum.dim_IntList %486, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %488 = torch.aten.mul.Tensor %482, %487 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %489 = torch.aten.sub.Tensor %484, %485, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %490 = torch.aten.sub.Tensor %489, %488, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %491 = torch.aten.div.Tensor %result2_53, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %492 = torch.aten.mul.Tensor %491, %490 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %493 = torch.aten.mul.Tensor %479, %482 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %494 = torch.aten.sum.dim_IntList %493, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %495 = torch.aten.sum.dim_IntList %479, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Multiplies %492 elementwise by %arg262, flattens to [2048,768] (%497), and
    // derives:
    //   %499 = %497 x transpose(%arg260)   -> [2048,3072]; viewed as
    //          [4,512,3072] (%505).
    //   %501 = transpose(%497) x %arg261   -> [768,3072]; transposed to
    //          [3072,768] (%502) and back to [768,3072] (%506).
    //   %503 = sum of %497 over the dims in %24, viewed as [768] (%504).
    // NOTE(review): %arg262 is presumably a saved dropout mask/scale and this is
    // the backward of the 3072->768 feed-forward output projection -- confirm.
    // %84 is defined before this section.
    %496 = torch.aten.mul.Tensor %492, %arg262 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %497 = torch.aten.view %496, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_260 = torch.constant.int 0
    %int1_261 = torch.constant.int 1
    %498 = torch.aten.transpose.int %arg260, %int0_260, %int1_261 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %499 = torch.aten.mm %497, %498 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_262 = torch.constant.int 0
    %int1_263 = torch.constant.int 1
    %500 = torch.aten.transpose.int %497, %int0_262, %int1_263 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %501 = torch.aten.mm %500, %arg261 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_264 = torch.constant.int 0
    %int1_265 = torch.constant.int 1
    %502 = torch.aten.transpose.int %501, %int0_264, %int1_265 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %503 = torch.aten.sum.dim_IntList %497, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %504 = torch.aten.view %503, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %505 = torch.aten.view %499, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_266 = torch.constant.int 0
    %int1_267 = torch.constant.int 1
    %506 = torch.aten.transpose.int %502, %int0_266, %int1_267 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // Applies aten.gelu_backward to %505 with %arg259 as the second operand and
    // %str as the string argument, flattens the result to [2048,3072] (%508), and
    // derives:
    //   %510 = %508 x transpose(%arg257)   -> [2048,768]; viewed as [4,512,768]
    //          (%516) and added to %492 to give %517.
    //   %512 = transpose(%508) x %arg258   -> [3072,768]; transposed to
    //          [768,3072] (%513) and back to [3072,768] (%518).
    //   %514 = sum of %508 over the dims in %24, viewed as [3072] (%515).
    // NOTE(review): presumably the backward of the 768->3072 feed-forward input
    // projection, with the add to %492 being the residual-branch gradient --
    // confirm. %str, %88 and %96 are defined before this section.
    %507 = torch.aten.gelu_backward %505, %arg259, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %508 = torch.aten.view %507, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_268 = torch.constant.int 0
    %int1_269 = torch.constant.int 1
    %509 = torch.aten.transpose.int %arg257, %int0_268, %int1_269 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %510 = torch.aten.mm %508, %509 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_270 = torch.constant.int 0
    %int1_271 = torch.constant.int 1
    %511 = torch.aten.transpose.int %508, %int0_270, %int1_271 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %512 = torch.aten.mm %511, %arg258 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_272 = torch.constant.int 0
    %int1_273 = torch.constant.int 1
    %513 = torch.aten.transpose.int %512, %int0_272, %int1_273 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %514 = torch.aten.sum.dim_IntList %508, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %515 = torch.aten.view %514, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %516 = torch.aten.view %510, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %517 = torch.aten.add.Tensor %492, %516, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_274 = torch.constant.int 0
    %int1_275 = torch.constant.int 1
    %518 = torch.aten.transpose.int %513, %int0_274, %int1_275 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Elementwise arithmetic on %517 using %arg256 and the per-row [4,512,1]
    // tensors %result1_49 / %result2_50:
    //   %520 = (%arg256 - %result1_49) * %result2_50
    //   %521 = %517 * %arg43                         (%arg43 is [768])
    //   %530 = (%result2_50 / %0)
    //          * (%521 * %0 - sum(%521, %35) - %520 * sum(%521 * %520, %35))
    //   %532 = sum(%517 * %520, %45)  -> [768]
    //   %533 = sum(%517, %45)         -> [768]
    // NOTE(review): same layer-norm-backward-shaped formula as used elsewhere in
    // this function (mean = %result1_49, rstd = %result2_50, weight = %arg43) --
    // confirm.
    %519 = torch.aten.sub.Tensor %arg256, %result1_49, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %520 = torch.aten.mul.Tensor %519, %result2_50 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %521 = torch.aten.mul.Tensor %517, %arg43 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %522 = torch.aten.mul.Tensor %521, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %523 = torch.aten.sum.dim_IntList %521, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %524 = torch.aten.mul.Tensor %521, %520 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %525 = torch.aten.sum.dim_IntList %524, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %526 = torch.aten.mul.Tensor %520, %525 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %527 = torch.aten.sub.Tensor %522, %523, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %528 = torch.aten.sub.Tensor %527, %526, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %529 = torch.aten.div.Tensor %result2_50, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %530 = torch.aten.mul.Tensor %529, %528 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %531 = torch.aten.mul.Tensor %517, %520 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %532 = torch.aten.sum.dim_IntList %531, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %533 = torch.aten.sum.dim_IntList %517, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Multiplies %530 elementwise by %arg255, flattens to [2048,768] (%535), and
    // derives:
    //   %537 = %535 x transpose(%arg253)   -> [2048,768]; viewed as [4,512,768]
    //          (%543). Unlike the neighbouring patterns, %543 is not added to a
    //          running sum here; it is consumed by the head-split that follows.
    //   %539 = transpose(%535) x %arg254   -> [768,768]; transposed twice
    //          (%540, %544).
    //   %541 = sum of %535 over the dims in %24, viewed as [768] (%542).
    // NOTE(review): %arg255 is presumably a saved dropout mask/scale and this is
    // the backward of the attention output projection -- confirm.
    %534 = torch.aten.mul.Tensor %530, %arg255 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %535 = torch.aten.view %534, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_276 = torch.constant.int 0
    %int1_277 = torch.constant.int 1
    %536 = torch.aten.transpose.int %arg253, %int0_276, %int1_277 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %537 = torch.aten.mm %535, %536 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_278 = torch.constant.int 0
    %int1_279 = torch.constant.int 1
    %538 = torch.aten.transpose.int %535, %int0_278, %int1_279 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %539 = torch.aten.mm %538, %arg254 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_280 = torch.constant.int 0
    %int1_281 = torch.constant.int 1
    %540 = torch.aten.transpose.int %539, %int0_280, %int1_281 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %541 = torch.aten.sum.dim_IntList %535, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %542 = torch.aten.view %541, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %543 = torch.aten.view %537, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_282 = torch.constant.int 0
    %int1_283 = torch.constant.int 1
    %544 = torch.aten.transpose.int %540, %int0_282, %int1_283 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Splits %543 into heads: view to [4,512,12,64] (%127), permute with %129 to
    // [4,12,512,64], clone, and flatten to [48,512,64] (%548). Then:
    //   %550 = transpose(%arg251, dims 1/2) x %548   -> [48,512,64]
    //   %552 = %548 x transpose(%arg252, dims 1/2)   -> [48,512,512]
    // Both are viewed back to 4-D (%553, %554); %554 is multiplied elementwise
    // by %arg250 and then by %arg249 to give %556.
    // NOTE(review): presumably the backward of the attention "probs x value"
    // matmul, with %arg250 a dropout mask and %arg249 the saved softmax output
    // -- confirm. %127 is defined before this section.
    %545 = torch.aten.view %543, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %546 = torch.aten.permute %545, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %547 = torch.aten.clone %546, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %548 = torch.aten.view %547, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %549 = torch.aten.transpose.int %arg251, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %550 = torch.aten.bmm %549, %548 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %551 = torch.aten.transpose.int %arg252, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %552 = torch.aten.bmm %548, %551 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %553 = torch.aten.view %550, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %554 = torch.aten.view %552, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %555 = torch.aten.mul.Tensor %554, %arg250 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %556 = torch.aten.mul.Tensor %555, %arg249 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // Computes %566 = %556 - %arg249 * broadcast(sum(%556, dim=-1, keepdim=true)):
    //   * %558 reduces %556 over its last dimension, keeping it as size 1.
    //   * %559..%563 read %556's own four sizes to build the broadcast target.
    //   * %564 expands the reduction back to [4,12,512,512].
    // The result is divided by the 0-d f64 tensor %1 and flattened to
    // [48,512,512] (%568).
    // NOTE(review): since %556 already contains a factor of %arg249, this has the
    // form of a softmax backward followed by undoing the attention-score scaling
    // -- confirm the value of %1.
    %557 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_284 = torch.constant.bool true
    %none_285 = torch.constant.none
    %558 = torch.aten.sum.dim_IntList %556, %557, %true_284, %none_285 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_286 = torch.constant.int 0
    %559 = torch.aten.size.int %556, %int0_286 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_287 = torch.constant.int 1
    %560 = torch.aten.size.int %556, %int1_287 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_288 = torch.constant.int 2
    %561 = torch.aten.size.int %556, %int2_288 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_289 = torch.constant.int 3
    %562 = torch.aten.size.int %556, %int3_289 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %563 = torch.prim.ListConstruct %559, %560, %561, %562 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %564 = torch.aten.broadcast_to %558, %563 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %565 = torch.aten.mul.Tensor %arg249, %564 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_290 = torch.constant.float 1.000000e+00
    %566 = torch.aten.sub.Tensor %556, %565, %float1.000000e00_290 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %567 = torch.aten.div.Tensor %566, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %568 = torch.aten.view %567, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // Two batched products against %568 ([48,512,512]):
    //   %570 = transpose(%arg247, dims 1/2) x %568   -> [48,64,512]
    //   %572 = %568 x transpose(%arg248, dims 1/2)   -> [48,512,64]
    // Each is viewed back to 4-D (%573, %574); %573 additionally has its last two
    // dimensions swapped so that it is [4,12,512,64] (%575).
    // %574 and the earlier %553 are then permuted with %129 to [4,512,12,64],
    // cloned, and viewed as [4,512,768] (%578 and %581 respectively).
    // NOTE(review): presumably the gradients flowing to the query/key
    // projections (via %arg247/%arg248) and the value projection (%553) --
    // confirm.
    %569 = torch.aten.transpose.int %arg247, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %570 = torch.aten.bmm %569, %568 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %571 = torch.aten.transpose.int %arg248, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %572 = torch.aten.bmm %568, %571 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %573 = torch.aten.view %570, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %574 = torch.aten.view %572, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %575 = torch.aten.transpose.int %573, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %576 = torch.aten.permute %574, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %577 = torch.aten.clone %576, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %578 = torch.aten.view %577, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %579 = torch.aten.permute %553, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %580 = torch.aten.clone %579, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %581 = torch.aten.view %580, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // Flattens %581 to [2048,768] (%582) and derives three results from it:
    //   %584 = %582 x transpose(%arg245)   -> [2048,768]; viewed as [4,512,768]
    //          (%590) and added to %530 to give the running sum %591.
    //   %586 = transpose(%582) x %arg246   -> [768,768]; transposed twice
    //          (%587, %592).
    //   %588 = sum of %582 over the dims in %24, viewed as [768] (%589).
    // NOTE(review): this is the shape of a linear-layer backward (input grad,
    // weight grad, bias grad) with %arg245 the weight and %arg246 the saved
    // layer input; %530 would be the residual-branch gradient -- confirm.
    %582 = torch.aten.view %581, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_291 = torch.constant.int 0
    %int1_292 = torch.constant.int 1
    %583 = torch.aten.transpose.int %arg245, %int0_291, %int1_292 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %584 = torch.aten.mm %582, %583 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_293 = torch.constant.int 0
    %int1_294 = torch.constant.int 1
    %585 = torch.aten.transpose.int %582, %int0_293, %int1_294 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %586 = torch.aten.mm %585, %arg246 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_295 = torch.constant.int 0
    %int1_296 = torch.constant.int 1
    %587 = torch.aten.transpose.int %586, %int0_295, %int1_296 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %588 = torch.aten.sum.dim_IntList %582, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %589 = torch.aten.view %588, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %590 = torch.aten.view %584, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %591 = torch.aten.add.Tensor %530, %590, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_297 = torch.constant.int 0
    %int1_298 = torch.constant.int 1
    %592 = torch.aten.transpose.int %587, %int0_297, %int1_298 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Merges the heads of %575: permute with %129 to [4,512,12,64], view as
    // [4,512,768], clone, then flatten to [2048,768] (%596). Note that here the
    // view precedes the clone, unlike the %577/%580 chains above.
    // From %596 the same three-result pattern follows:
    //   %598 = %596 x transpose(%arg243)   -> [2048,768]; viewed as [4,512,768]
    //          (%604) and added to %591 to give %605.
    //   %600 = transpose(%596) x %arg244   -> [768,768]; transposed twice
    //          (%601, %606).
    //   %602 = sum of %596 over the dims in %24, viewed as [768] (%603).
    // NOTE(review): presumably a second linear-layer backward feeding the same
    // accumulated input gradient -- confirm which projection %arg243 belongs to.
    %593 = torch.aten.permute %575, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %594 = torch.aten.view %593, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %595 = torch.aten.clone %594, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %596 = torch.aten.view %595, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_299 = torch.constant.int 0
    %int1_300 = torch.constant.int 1
    %597 = torch.aten.transpose.int %arg243, %int0_299, %int1_300 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %598 = torch.aten.mm %596, %597 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_301 = torch.constant.int 0
    %int1_302 = torch.constant.int 1
    %599 = torch.aten.transpose.int %596, %int0_301, %int1_302 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %600 = torch.aten.mm %599, %arg244 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_303 = torch.constant.int 0
    %int1_304 = torch.constant.int 1
    %601 = torch.aten.transpose.int %600, %int0_303, %int1_304 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %602 = torch.aten.sum.dim_IntList %596, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %603 = torch.aten.view %602, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %604 = torch.aten.view %598, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %605 = torch.aten.add.Tensor %591, %604, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_305 = torch.constant.int 0
    %int1_306 = torch.constant.int 1
    %606 = torch.aten.transpose.int %601, %int0_305, %int1_306 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Flattens %578 to [2048,768] (%607) and repeats the three-result pattern:
    //   %609 = %607 x transpose(%arg241)   -> [2048,768]; viewed as [4,512,768]
    //          (%615) and added to %605 to give %616.
    //   %611 = transpose(%607) x %arg242   -> [768,768]; transposed twice
    //          (%612, %617).
    //   %613 = sum of %607 over the dims in %24, viewed as [768] (%614).
    // NOTE(review): presumably the third of the Q/K/V linear-layer backwards;
    // %616 would then be the total gradient w.r.t. the attention block input --
    // confirm.
    %607 = torch.aten.view %578, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_307 = torch.constant.int 0
    %int1_308 = torch.constant.int 1
    %608 = torch.aten.transpose.int %arg241, %int0_307, %int1_308 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %609 = torch.aten.mm %607, %608 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_309 = torch.constant.int 0
    %int1_310 = torch.constant.int 1
    %610 = torch.aten.transpose.int %607, %int0_309, %int1_310 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %611 = torch.aten.mm %610, %arg242 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_311 = torch.constant.int 0
    %int1_312 = torch.constant.int 1
    %612 = torch.aten.transpose.int %611, %int0_311, %int1_312 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %613 = torch.aten.sum.dim_IntList %607, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %614 = torch.aten.view %613, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %615 = torch.aten.view %609, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %616 = torch.aten.add.Tensor %605, %615, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_313 = torch.constant.int 0
    %int1_314 = torch.constant.int 1
    %617 = torch.aten.transpose.int %612, %int0_313, %int1_314 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Elementwise arithmetic on %616 using %arg240 and the per-row [4,512,1]
    // tensors %result1_46 / %result2_47:
    //   %619 = (%arg240 - %result1_46) * %result2_47
    //   %620 = %616 * %arg41                         (%arg41 is [768])
    //   %629 = (%result2_47 / %0)
    //          * (%620 * %0 - sum(%620, %35) - %619 * sum(%620 * %619, %35))
    //   %631 = sum(%616 * %619, %45)  -> [768]
    //   %632 = sum(%616, %45)         -> [768]
    // NOTE(review): same layer-norm-backward-shaped formula as used elsewhere in
    // this function (mean = %result1_46, rstd = %result2_47, weight = %arg41) --
    // confirm.
    %618 = torch.aten.sub.Tensor %arg240, %result1_46, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %619 = torch.aten.mul.Tensor %618, %result2_47 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %620 = torch.aten.mul.Tensor %616, %arg41 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %621 = torch.aten.mul.Tensor %620, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %622 = torch.aten.sum.dim_IntList %620, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %623 = torch.aten.mul.Tensor %620, %619 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %624 = torch.aten.sum.dim_IntList %623, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %625 = torch.aten.mul.Tensor %619, %624 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %626 = torch.aten.sub.Tensor %621, %622, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %627 = torch.aten.sub.Tensor %626, %625, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %628 = torch.aten.div.Tensor %result2_47, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %629 = torch.aten.mul.Tensor %628, %627 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %630 = torch.aten.mul.Tensor %616, %619 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %631 = torch.aten.sum.dim_IntList %630, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %632 = torch.aten.sum.dim_IntList %616, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %633: elementwise product of %629 with %arg239; %634: the result viewed as [2048,768].
    // NOTE(review): %arg239 is possibly a saved dropout mask/scale - confirm against the forward graph.
    // Three products are then derived from %634:
    //   %636 = %634 x transpose(%arg237)   [2048,768] x [768,3072] -> [2048,3072], viewed as [4,512,3072] in %642
    //   %638 = transpose(%634) x %arg238   [768,2048] x [2048,3072] -> [768,3072]
    //   %641 = sum(%634) over %24, viewed as [768]
    // %638 is transposed to [3072,768] (%639) and back to [768,3072] (%643).
    // NOTE(review): this is the shape of a linear-layer backward (input grad, weight grad, bias grad) - confirm.
    %633 = torch.aten.mul.Tensor %629, %arg239 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %634 = torch.aten.view %633, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_315 = torch.constant.int 0
    %int1_316 = torch.constant.int 1
    %635 = torch.aten.transpose.int %arg237, %int0_315, %int1_316 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %636 = torch.aten.mm %634, %635 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_317 = torch.constant.int 0
    %int1_318 = torch.constant.int 1
    %637 = torch.aten.transpose.int %634, %int0_317, %int1_318 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %638 = torch.aten.mm %637, %arg238 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_319 = torch.constant.int 0
    %int1_320 = torch.constant.int 1
    %639 = torch.aten.transpose.int %638, %int0_319, %int1_320 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %640 = torch.aten.sum.dim_IntList %634, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %641 = torch.aten.view %640, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %642 = torch.aten.view %636, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_321 = torch.constant.int 0
    %int1_322 = torch.constant.int 1
    %643 = torch.aten.transpose.int %639, %int0_321, %int1_322 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // %644: gelu_backward of incoming gradient %642 with respect to %arg236 (approximation mode string is %str,
    // defined outside this chunk); %645: the result viewed as [2048,3072].
    // Three products are then derived from %645:
    //   %647 = %645 x transpose(%arg234)   [2048,3072] x [3072,768] -> [2048,768], viewed as [4,512,768] in %653
    //   %649 = transpose(%645) x %arg235   [3072,2048] x [2048,768] -> [3072,768]
    //   %652 = sum(%645) over %24, viewed as [3072]
    // %654 = %629 + %653 (alpha operand %int1) accumulates this path with the earlier gradient %629.
    // %649 is transposed to [768,3072] (%650) and back to [3072,768] (%655).
    // NOTE(review): presumably the backward of the feed-forward input projection followed by the
    // residual accumulation - confirm against the forward graph.
    %644 = torch.aten.gelu_backward %642, %arg236, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %645 = torch.aten.view %644, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_323 = torch.constant.int 0
    %int1_324 = torch.constant.int 1
    %646 = torch.aten.transpose.int %arg234, %int0_323, %int1_324 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %647 = torch.aten.mm %645, %646 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_325 = torch.constant.int 0
    %int1_326 = torch.constant.int 1
    %648 = torch.aten.transpose.int %645, %int0_325, %int1_326 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %649 = torch.aten.mm %648, %arg235 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_327 = torch.constant.int 0
    %int1_328 = torch.constant.int 1
    %650 = torch.aten.transpose.int %649, %int0_327, %int1_328 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %651 = torch.aten.sum.dim_IntList %645, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %652 = torch.aten.view %651, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %653 = torch.aten.view %647, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %654 = torch.aten.add.Tensor %629, %653, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_329 = torch.constant.int 0
    %int1_330 = torch.constant.int 1
    %655 = torch.aten.transpose.int %650, %int0_329, %int1_330 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Same reduction/elementwise sequence as used elsewhere in this function, applied to %654
    // (sub alpha operands are %int1):
    //   %657 = (%arg233 - %result1_43) * %result2_44
    //   %658 = %654 * %arg39          %659 = %658 * %0
    //   %660 = sum(%658) over %35     %662 = sum(%658 * %657) over %35   (both keepdim -> [4,512,1])
    //   %667 = (%result2_44 / %0) * (%659 - %660 - %657 * %662)
    //   %669 = sum(%654 * %657) over %45 -> [768]      %670 = sum(%654) over %45 -> [768]
    // NOTE(review): matches a layer-norm backward; presumably %result1_43/%result2_44 are the saved
    // mean/rstd and %arg39 the layer-norm weight - confirm against the forward graph.
    %656 = torch.aten.sub.Tensor %arg233, %result1_43, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %657 = torch.aten.mul.Tensor %656, %result2_44 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %658 = torch.aten.mul.Tensor %654, %arg39 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %659 = torch.aten.mul.Tensor %658, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %660 = torch.aten.sum.dim_IntList %658, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %661 = torch.aten.mul.Tensor %658, %657 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %662 = torch.aten.sum.dim_IntList %661, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %663 = torch.aten.mul.Tensor %657, %662 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %664 = torch.aten.sub.Tensor %659, %660, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %665 = torch.aten.sub.Tensor %664, %663, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %666 = torch.aten.div.Tensor %result2_44, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %667 = torch.aten.mul.Tensor %666, %665 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %668 = torch.aten.mul.Tensor %654, %657 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %669 = torch.aten.sum.dim_IntList %668, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %670 = torch.aten.sum.dim_IntList %654, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %671: elementwise product of %667 with %arg232; %672: the result viewed as [2048,768].
    // NOTE(review): %arg232 is possibly a saved dropout mask/scale - confirm.
    // Three products are then derived from %672:
    //   %674 = %672 x transpose(%arg230)   [2048,768] x [768,768] -> [2048,768], viewed as [4,512,768] in %680
    //   %676 = transpose(%672) x %arg231   [768,2048] x [2048,768] -> [768,768]
    //   %679 = sum(%672) over %24, viewed as [768]
    // %676 is transposed twice (%677, then %681).
    // NOTE(review): shape of a 768x768 linear-layer backward (presumably the attention output projection) - confirm.
    %671 = torch.aten.mul.Tensor %667, %arg232 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %672 = torch.aten.view %671, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_331 = torch.constant.int 0
    %int1_332 = torch.constant.int 1
    %673 = torch.aten.transpose.int %arg230, %int0_331, %int1_332 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %674 = torch.aten.mm %672, %673 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_333 = torch.constant.int 0
    %int1_334 = torch.constant.int 1
    %675 = torch.aten.transpose.int %672, %int0_333, %int1_334 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %676 = torch.aten.mm %675, %arg231 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_335 = torch.constant.int 0
    %int1_336 = torch.constant.int 1
    %677 = torch.aten.transpose.int %676, %int0_335, %int1_336 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %678 = torch.aten.sum.dim_IntList %672, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %679 = torch.aten.view %678, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %680 = torch.aten.view %674, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_337 = torch.constant.int 0
    %int1_338 = torch.constant.int 1
    %681 = torch.aten.transpose.int %677, %int0_337, %int1_338 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // %682-%685: split the 768 channels of %680 into [12,64], permute (order in %129) to [4,12,512,64],
    // clone (memory_format operand %int0) and view as [48,512,64].
    // %687 = transpose(%arg228, %int1, %int2) bmm %685 -> [48,512,64], viewed as [4,12,512,64] in %690
    // %689 = %685 bmm transpose(%arg229, %int1, %int2) -> [48,512,512], viewed as [4,12,512,512] in %691
    // %693 = %691 * %arg227 * %arg226 (elementwise, [4,12,512,512]).
    // NOTE(review): looks like the backward of the attention "probabilities x values" batched matmul
    // with 12 heads of size 64; presumably %arg227 is a dropout mask and %arg226 the saved softmax
    // output - confirm against the forward graph.
    %682 = torch.aten.view %680, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %683 = torch.aten.permute %682, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %684 = torch.aten.clone %683, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %685 = torch.aten.view %684, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %686 = torch.aten.transpose.int %arg228, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %687 = torch.aten.bmm %686, %685 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %688 = torch.aten.transpose.int %arg229, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %689 = torch.aten.bmm %685, %688 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %690 = torch.aten.view %687, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %691 = torch.aten.view %689, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %692 = torch.aten.mul.Tensor %691, %arg227 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %693 = torch.aten.mul.Tensor %692, %arg226 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // %695: sum of %693 over the single dim listed in %694 (built from %int-1), keepdim -> [4,12,512,1].
    // %696-%701: read the four sizes of %693 and broadcast %695 back to [4,12,512,512].
    // %703 = %693 - %arg226 * %701 (alpha operand is the float constant 1.0).
    // %704 = %703 / %1, where %1 is an f64 scalar tensor defined outside this chunk.
    // %705: the result viewed as [48,512,512].
    // NOTE(review): since %693 already contains the factor %arg226, this has the form
    // grad*y - y*sum(grad*y), i.e. a softmax backward over the last dim; %1 is presumably the
    // attention scale sqrt(64) = 8 - confirm both.
    %694 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_339 = torch.constant.bool true
    %none_340 = torch.constant.none
    %695 = torch.aten.sum.dim_IntList %693, %694, %true_339, %none_340 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_341 = torch.constant.int 0
    %696 = torch.aten.size.int %693, %int0_341 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_342 = torch.constant.int 1
    %697 = torch.aten.size.int %693, %int1_342 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_343 = torch.constant.int 2
    %698 = torch.aten.size.int %693, %int2_343 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_344 = torch.constant.int 3
    %699 = torch.aten.size.int %693, %int3_344 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %700 = torch.prim.ListConstruct %696, %697, %698, %699 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %701 = torch.aten.broadcast_to %695, %700 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %702 = torch.aten.mul.Tensor %arg226, %701 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_345 = torch.constant.float 1.000000e+00
    %703 = torch.aten.sub.Tensor %693, %702, %float1.000000e00_345 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %704 = torch.aten.div.Tensor %703, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %705 = torch.aten.view %704, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // %707 = transpose(%arg224, %int1, %int2) bmm %705 -> [48,64,512], viewed as [4,12,64,512] in %710
    // %709 = %705 bmm transpose(%arg225, %int1, %int2) -> [48,512,64], viewed as [4,12,512,64] in %711
    // %712: %710 with dims %int-1 and %int-2 swapped -> [4,12,512,64].
    // %713-%715: permute (%129) / clone / view of %711 -> [4,512,768].
    // %716-%718: the same permute / clone / view applied to %690 -> [4,512,768].
    // NOTE(review): looks like the backward of the attention-score batched matmul followed by merging
    // the 12 heads back into 768 channels; which of %712/%715/%718 is the query, key or value
    // gradient cannot be determined from this chunk - confirm against the forward graph.
    %706 = torch.aten.transpose.int %arg224, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %707 = torch.aten.bmm %706, %705 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %708 = torch.aten.transpose.int %arg225, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %709 = torch.aten.bmm %705, %708 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %710 = torch.aten.view %707, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %711 = torch.aten.view %709, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %712 = torch.aten.transpose.int %710, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %713 = torch.aten.permute %711, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %714 = torch.aten.clone %713, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %715 = torch.aten.view %714, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %716 = torch.aten.permute %690, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %717 = torch.aten.clone %716, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %718 = torch.aten.view %717, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // %719: %718 viewed as [2048,768]. Three products are then derived from it:
    //   %721 = %719 x transpose(%arg222)   [2048,768] x [768,768] -> [2048,768], viewed as [4,512,768] in %727
    //   %723 = transpose(%719) x %arg223   [768,2048] x [2048,768] -> [768,768]
    //   %726 = sum(%719) over %24, viewed as [768]
    // %728 = %667 + %727 (alpha operand %int1) starts accumulating the three attention branches onto %667.
    // %723 is transposed twice (%724, then %729).
    // NOTE(review): shape of a 768x768 linear-layer backward, one of the three attention input
    // projections - confirm which one.
    %719 = torch.aten.view %718, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_346 = torch.constant.int 0
    %int1_347 = torch.constant.int 1
    %720 = torch.aten.transpose.int %arg222, %int0_346, %int1_347 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %721 = torch.aten.mm %719, %720 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_348 = torch.constant.int 0
    %int1_349 = torch.constant.int 1
    %722 = torch.aten.transpose.int %719, %int0_348, %int1_349 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %723 = torch.aten.mm %722, %arg223 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_350 = torch.constant.int 0
    %int1_351 = torch.constant.int 1
    %724 = torch.aten.transpose.int %723, %int0_350, %int1_351 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %725 = torch.aten.sum.dim_IntList %719, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %726 = torch.aten.view %725, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %727 = torch.aten.view %721, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %728 = torch.aten.add.Tensor %667, %727, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_352 = torch.constant.int 0
    %int1_353 = torch.constant.int 1
    %729 = torch.aten.transpose.int %724, %int0_352, %int1_353 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // %730-%733: permute %712 (order in %129) to [4,512,12,64], view as [4,512,768], clone, view as [2048,768].
    // Unlike the %713-%715 and %716-%718 branches, the view here comes before the clone.
    // Three products are then derived from %733:
    //   %735 = %733 x transpose(%arg220)   [2048,768] x [768,768] -> [2048,768], viewed as [4,512,768] in %741
    //   %737 = transpose(%733) x %arg221   [768,2048] x [2048,768] -> [768,768]
    //   %740 = sum(%733) over %24, viewed as [768]
    // %742 = %728 + %741 (alpha operand %int1) adds the second attention branch.
    // %737 is transposed twice (%738, then %743).
    // NOTE(review): shape of a 768x768 linear-layer backward, one of the three attention input
    // projections - confirm which one.
    %730 = torch.aten.permute %712, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %731 = torch.aten.view %730, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %732 = torch.aten.clone %731, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %733 = torch.aten.view %732, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_354 = torch.constant.int 0
    %int1_355 = torch.constant.int 1
    %734 = torch.aten.transpose.int %arg220, %int0_354, %int1_355 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %735 = torch.aten.mm %733, %734 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_356 = torch.constant.int 0
    %int1_357 = torch.constant.int 1
    %736 = torch.aten.transpose.int %733, %int0_356, %int1_357 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %737 = torch.aten.mm %736, %arg221 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_358 = torch.constant.int 0
    %int1_359 = torch.constant.int 1
    %738 = torch.aten.transpose.int %737, %int0_358, %int1_359 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %739 = torch.aten.sum.dim_IntList %733, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %740 = torch.aten.view %739, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %741 = torch.aten.view %735, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %742 = torch.aten.add.Tensor %728, %741, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_360 = torch.constant.int 0
    %int1_361 = torch.constant.int 1
    %743 = torch.aten.transpose.int %738, %int0_360, %int1_361 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // %744: %715 viewed as [2048,768]. Three products are then derived from it:
    //   %746 = %744 x transpose(%arg218)   [2048,768] x [768,768] -> [2048,768], viewed as [4,512,768] in %752
    //   %748 = transpose(%744) x %arg219   [768,2048] x [2048,768] -> [768,768]
    //   %751 = sum(%744) over %24, viewed as [768]
    // %753 = %742 + %752 (alpha operand %int1) adds the third attention branch; %753 feeds the
    // reduction sequence that follows.
    // %748 is transposed twice (%749, then %754).
    // NOTE(review): shape of a 768x768 linear-layer backward, one of the three attention input
    // projections - confirm which one.
    %744 = torch.aten.view %715, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_362 = torch.constant.int 0
    %int1_363 = torch.constant.int 1
    %745 = torch.aten.transpose.int %arg218, %int0_362, %int1_363 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %746 = torch.aten.mm %744, %745 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_364 = torch.constant.int 0
    %int1_365 = torch.constant.int 1
    %747 = torch.aten.transpose.int %744, %int0_364, %int1_365 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %748 = torch.aten.mm %747, %arg219 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_366 = torch.constant.int 0
    %int1_367 = torch.constant.int 1
    %749 = torch.aten.transpose.int %748, %int0_366, %int1_367 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %750 = torch.aten.sum.dim_IntList %744, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %751 = torch.aten.view %750, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %752 = torch.aten.view %746, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %753 = torch.aten.add.Tensor %742, %752, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_368 = torch.constant.int 0
    %int1_369 = torch.constant.int 1
    %754 = torch.aten.transpose.int %749, %int0_368, %int1_369 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Reduction/elementwise sequence applied to %753 (sub alpha operands are %int1):
    //   %756 = (%arg217 - %result1_40) * %result2_41
    //   %757 = %753 * %arg37          %758 = %757 * %0
    //   %759 = sum(%757) over %35     %761 = sum(%757 * %756) over %35   (both keepdim -> [4,512,1])
    //   %766 = (%result2_41 / %0) * (%758 - %759 - %756 * %761)
    //   %768 = sum(%753 * %756) over %45 -> [768]      %769 = sum(%753) over %45 -> [768]
    // NOTE(review): matches a layer-norm backward; presumably %result1_40/%result2_41 are the saved
    // mean/rstd and %arg37 the layer-norm weight - confirm against the forward graph.
    %755 = torch.aten.sub.Tensor %arg217, %result1_40, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %756 = torch.aten.mul.Tensor %755, %result2_41 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %757 = torch.aten.mul.Tensor %753, %arg37 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %758 = torch.aten.mul.Tensor %757, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %759 = torch.aten.sum.dim_IntList %757, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %760 = torch.aten.mul.Tensor %757, %756 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %761 = torch.aten.sum.dim_IntList %760, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %762 = torch.aten.mul.Tensor %756, %761 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %763 = torch.aten.sub.Tensor %758, %759, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %764 = torch.aten.sub.Tensor %763, %762, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %765 = torch.aten.div.Tensor %result2_41, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %766 = torch.aten.mul.Tensor %765, %764 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %767 = torch.aten.mul.Tensor %753, %756 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %768 = torch.aten.sum.dim_IntList %767, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %769 = torch.aten.sum.dim_IntList %753, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %770: elementwise product of %766 with %arg216; %771: the result viewed as [2048,768].
    // NOTE(review): %arg216 is possibly a saved dropout mask/scale - confirm.
    // Three products are then derived from %771:
    //   %773 = %771 x transpose(%arg214)   [2048,768] x [768,3072] -> [2048,3072], viewed as [4,512,3072] in %779
    //   %775 = transpose(%771) x %arg215   [768,2048] x [2048,3072] -> [768,3072]
    //   %778 = sum(%771) over %24, viewed as [768]
    // %775 is transposed to [3072,768] (%776) and back to [768,3072] (%780).
    // NOTE(review): shape of a linear-layer backward (presumably the feed-forward output projection) - confirm.
    %770 = torch.aten.mul.Tensor %766, %arg216 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %771 = torch.aten.view %770, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_370 = torch.constant.int 0
    %int1_371 = torch.constant.int 1
    %772 = torch.aten.transpose.int %arg214, %int0_370, %int1_371 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %773 = torch.aten.mm %771, %772 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_372 = torch.constant.int 0
    %int1_373 = torch.constant.int 1
    %774 = torch.aten.transpose.int %771, %int0_372, %int1_373 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %775 = torch.aten.mm %774, %arg215 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_374 = torch.constant.int 0
    %int1_375 = torch.constant.int 1
    %776 = torch.aten.transpose.int %775, %int0_374, %int1_375 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %777 = torch.aten.sum.dim_IntList %771, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %778 = torch.aten.view %777, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %779 = torch.aten.view %773, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_376 = torch.constant.int 0
    %int1_377 = torch.constant.int 1
    %780 = torch.aten.transpose.int %776, %int0_376, %int1_377 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // %781: gelu_backward of incoming gradient %779 with respect to %arg213 (approximation mode string is %str,
    // defined outside this chunk); %782: the result viewed as [2048,3072].
    // Three products are then derived from %782:
    //   %784 = %782 x transpose(%arg211)   [2048,3072] x [3072,768] -> [2048,768], viewed as [4,512,768] in %790
    //   %786 = transpose(%782) x %arg212   [3072,2048] x [2048,768] -> [3072,768]
    //   %789 = sum(%782) over %24, viewed as [3072]
    // %791 = %766 + %790 (alpha operand %int1) accumulates this path with the earlier gradient %766.
    // %786 is transposed to [768,3072] (%787) and back to [3072,768] (%792).
    // NOTE(review): presumably the backward of the feed-forward input projection followed by the
    // residual accumulation - confirm against the forward graph.
    %781 = torch.aten.gelu_backward %779, %arg213, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %782 = torch.aten.view %781, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_378 = torch.constant.int 0
    %int1_379 = torch.constant.int 1
    %783 = torch.aten.transpose.int %arg211, %int0_378, %int1_379 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %784 = torch.aten.mm %782, %783 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_380 = torch.constant.int 0
    %int1_381 = torch.constant.int 1
    %785 = torch.aten.transpose.int %782, %int0_380, %int1_381 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %786 = torch.aten.mm %785, %arg212 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_382 = torch.constant.int 0
    %int1_383 = torch.constant.int 1
    %787 = torch.aten.transpose.int %786, %int0_382, %int1_383 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %788 = torch.aten.sum.dim_IntList %782, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %789 = torch.aten.view %788, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %790 = torch.aten.view %784, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %791 = torch.aten.add.Tensor %766, %790, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_384 = torch.constant.int 0
    %int1_385 = torch.constant.int 1
    %792 = torch.aten.transpose.int %787, %int0_384, %int1_385 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Reduction/elementwise sequence applied to %791 (sub alpha operands are %int1):
    //   %794 = (%arg210 - %result1_37) * %result2_38
    //   %795 = %791 * %arg35          %796 = %795 * %0
    //   %797 = sum(%795) over %35     %799 = sum(%795 * %794) over %35   (both keepdim -> [4,512,1])
    //   %804 = (%result2_38 / %0) * (%796 - %797 - %794 * %799)
    //   %806 = sum(%791 * %794) over %45 -> [768]      %807 = sum(%791) over %45 -> [768]
    // NOTE(review): matches a layer-norm backward; presumably %result1_37/%result2_38 are the saved
    // mean/rstd and %arg35 the layer-norm weight - confirm against the forward graph.
    %793 = torch.aten.sub.Tensor %arg210, %result1_37, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %794 = torch.aten.mul.Tensor %793, %result2_38 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %795 = torch.aten.mul.Tensor %791, %arg35 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %796 = torch.aten.mul.Tensor %795, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %797 = torch.aten.sum.dim_IntList %795, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %798 = torch.aten.mul.Tensor %795, %794 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %799 = torch.aten.sum.dim_IntList %798, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %800 = torch.aten.mul.Tensor %794, %799 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %801 = torch.aten.sub.Tensor %796, %797, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %802 = torch.aten.sub.Tensor %801, %800, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %803 = torch.aten.div.Tensor %result2_38, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %804 = torch.aten.mul.Tensor %803, %802 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %805 = torch.aten.mul.Tensor %791, %794 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %806 = torch.aten.sum.dim_IntList %805, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %807 = torch.aten.sum.dim_IntList %791, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // %808: elementwise product of %804 with %arg209; %809: the result viewed as [2048,768].
    // NOTE(review): %arg209 is possibly a saved dropout mask/scale - confirm.
    // Three products are then derived from %809:
    //   %811 = %809 x transpose(%arg207)   [2048,768] x [768,768] -> [2048,768], viewed as [4,512,768] in %817
    //   %813 = transpose(%809) x %arg208   [768,2048] x [2048,768] -> [768,768]
    //   %816 = sum(%809) over %24, viewed as [768]
    // %813 is transposed twice (%814, then %818).
    // NOTE(review): shape of a 768x768 linear-layer backward (presumably the attention output projection) - confirm.
    %808 = torch.aten.mul.Tensor %804, %arg209 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %809 = torch.aten.view %808, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_386 = torch.constant.int 0
    %int1_387 = torch.constant.int 1
    %810 = torch.aten.transpose.int %arg207, %int0_386, %int1_387 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %811 = torch.aten.mm %809, %810 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_388 = torch.constant.int 0
    %int1_389 = torch.constant.int 1
    %812 = torch.aten.transpose.int %809, %int0_388, %int1_389 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %813 = torch.aten.mm %812, %arg208 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_390 = torch.constant.int 0
    %int1_391 = torch.constant.int 1
    %814 = torch.aten.transpose.int %813, %int0_390, %int1_391 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %815 = torch.aten.sum.dim_IntList %809, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %816 = torch.aten.view %815, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %817 = torch.aten.view %811, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_392 = torch.constant.int 0
    %int1_393 = torch.constant.int 1
    %818 = torch.aten.transpose.int %814, %int0_392, %int1_393 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Reshape the [4,512,768] gradient %817 into per-head layout and multiply it against
    // transposes of %arg205 and %arg206.
    // NOTE(review): looks like the backward of a (probs @ value) batched matmul — inferred
    // from the op pattern only; confirm against the forward graph.
    // %127, %129, %132, %138, %140, %int0, %int1, %int2 are defined outside this chunk.
    %819 = torch.aten.view %817, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %820 = torch.aten.permute %819, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %821 = torch.aten.clone %820, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    // Fold the two leading dims (4 x 12) into one dim of 48 for bmm.
    %822 = torch.aten.view %821, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    // transpose(%arg205) @ %822 -> [48,512,64]
    %823 = torch.aten.transpose.int %arg205, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %824 = torch.aten.bmm %823, %822 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    // %822 @ transpose(%arg206) -> [48,512,512]
    %825 = torch.aten.transpose.int %arg206, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %826 = torch.aten.bmm %822, %825 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    // Unfold both bmm results back to 4-D.
    %827 = torch.aten.view %824, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %828 = torch.aten.view %826, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    // Computes %830 - %arg203 * sum(%830, dim=-1, keepdim) and divides by the scalar tensor %1,
    // where %830 = %828 * %arg204 * %arg203.
    // NOTE(review): this has the shape of dropout-mask multiply followed by softmax backward
    // (g*y - y*sum(g*y)) and the attention scale division; presumably %arg204 is the dropout
    // mask, %arg203 the softmax output and %1 the sqrt(head_dim) scale — confirm.
    // %int-1, %1 and %155 are defined outside this chunk.
    %829 = torch.aten.mul.Tensor %828, %arg204 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %830 = torch.aten.mul.Tensor %829, %arg203 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // Reduce over the dim list [%int-1] with keepdim = true -> [4,12,512,1].
    %831 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_394 = torch.constant.bool true
    %none_395 = torch.constant.none
    %832 = torch.aten.sum.dim_IntList %830, %831, %true_394, %none_395 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    // Query each dimension of %830 to build the target shape for the broadcast below.
    %int0_396 = torch.constant.int 0
    %833 = torch.aten.size.int %830, %int0_396 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_397 = torch.constant.int 1
    %834 = torch.aten.size.int %830, %int1_397 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_398 = torch.constant.int 2
    %835 = torch.aten.size.int %830, %int2_398 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_399 = torch.constant.int 3
    %836 = torch.aten.size.int %830, %int3_399 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %837 = torch.prim.ListConstruct %833, %834, %835, %836 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Expand the reduced sum back to the full [4,12,512,512] shape.
    %838 = torch.aten.broadcast_to %832, %837 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %839 = torch.aten.mul.Tensor %arg203, %838 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // %830 - 1.0 * %839
    %float1.000000e00_400 = torch.constant.float 1.000000e+00
    %840 = torch.aten.sub.Tensor %830, %839, %float1.000000e00_400 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    // Divide by the 0-d f64 tensor %1; the result stays f32.
    %841 = torch.aten.div.Tensor %840, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    // Fold to [48,512,512] for the batched matmuls that follow.
    %842 = torch.aten.view %841, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // Multiply %842 against transposes of %arg201 and %arg202, then bring the three per-head
    // tensors (%847, %848 and the earlier %827) back toward [4,512,768] layout.
    // NOTE(review): looks like the backward of the (query @ key^T) batched matmul, with %852,
    // %849 and %856 being the gradients fed to three projection layers — inferred, confirm.
    // %161, %138, %129, %28, %49, %int0, %int1, %int2, %int-1, %int-2 are defined outside this chunk.
    // transpose(%arg201) @ %842 -> [48,64,512]
    %843 = torch.aten.transpose.int %arg201, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %844 = torch.aten.bmm %843, %842 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    // %842 @ transpose(%arg202) -> [48,512,64]
    %845 = torch.aten.transpose.int %arg202, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %846 = torch.aten.bmm %842, %845 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %847 = torch.aten.view %844, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %848 = torch.aten.view %846, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    // Swap the last two dims of %847 so it matches the [4,12,512,64] layout of %848.
    %849 = torch.aten.transpose.int %847, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    // %848: permute -> clone -> view merges the 12 x 64 head dims back into 768.
    %850 = torch.aten.permute %848, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %851 = torch.aten.clone %850, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %852 = torch.aten.view %851, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // %827 (produced by the earlier bmm with %arg205): same merge, then flatten to [2048,768].
    %853 = torch.aten.permute %827, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %854 = torch.aten.clone %853, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %855 = torch.aten.view %854, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %856 = torch.aten.view %855, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three products derived from the [2048,768] tensor %856, then accumulation into %804.
    // NOTE(review): matches the usual linear-layer backward triple (input grad, weight grad,
    // bias grad), presumably for one attention projection — inferred, confirm.
    // %24, %3, %28, %true, %none, %int1 and %804 are defined outside this chunk.
    %int0_401 = torch.constant.int 0
    %int1_402 = torch.constant.int 1
    // %856 @ transpose(%arg199) -> [2048,768]
    %857 = torch.aten.transpose.int %arg199, %int0_401, %int1_402 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %858 = torch.aten.mm %856, %857 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_403 = torch.constant.int 0
    %int1_404 = torch.constant.int 1
    // transpose(%856) @ %arg200 -> [768,768], then transposed.
    %859 = torch.aten.transpose.int %856, %int0_403, %int1_404 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %860 = torch.aten.mm %859, %arg200 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_405 = torch.constant.int 0
    %int1_406 = torch.constant.int 1
    %861 = torch.aten.transpose.int %860, %int0_405, %int1_406 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %856 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %862 = torch.aten.sum.dim_IntList %856, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %863 = torch.aten.view %862, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768] and add it to %804 (alpha operand %int1).
    %864 = torch.aten.view %858, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %865 = torch.aten.add.Tensor %804, %864, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_407 = torch.constant.int 0
    %int1_408 = torch.constant.int 1
    // Second transpose of the same dims: %866 has the orientation of %860 again.
    %866 = torch.aten.transpose.int %861, %int0_407, %int1_408 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Merge the heads of %849 into a [2048,768] tensor (%870), form three products from it
    // with %arg197 / %arg198, and accumulate into %865.
    // NOTE(review): same linear-layer backward pattern as the neighbouring blocks, presumably
    // for another attention projection — inferred, confirm.
    // %129, %28, %49, %24, %3, %true, %none, %int0, %int1 are defined outside this chunk.
    %867 = torch.aten.permute %849, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    // Note the order here is view -> clone (the sibling paths use clone -> view).
    %868 = torch.aten.view %867, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %869 = torch.aten.clone %868, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %870 = torch.aten.view %869, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_409 = torch.constant.int 0
    %int1_410 = torch.constant.int 1
    // %870 @ transpose(%arg197) -> [2048,768]
    %871 = torch.aten.transpose.int %arg197, %int0_409, %int1_410 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %872 = torch.aten.mm %870, %871 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_411 = torch.constant.int 0
    %int1_412 = torch.constant.int 1
    // transpose(%870) @ %arg198 -> [768,768], then transposed.
    %873 = torch.aten.transpose.int %870, %int0_411, %int1_412 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %874 = torch.aten.mm %873, %arg198 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_413 = torch.constant.int 0
    %int1_414 = torch.constant.int 1
    %875 = torch.aten.transpose.int %874, %int0_413, %int1_414 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %870 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %876 = torch.aten.sum.dim_IntList %870, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %877 = torch.aten.view %876, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768] and add it to the running sum %865.
    %878 = torch.aten.view %872, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %879 = torch.aten.add.Tensor %865, %878, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_415 = torch.constant.int 0
    %int1_416 = torch.constant.int 1
    // Second transpose of the same dims: %880 has the orientation of %874 again.
    %880 = torch.aten.transpose.int %875, %int0_415, %int1_416 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Flatten %852 to [2048,768] (%881), form three products from it with %arg195 / %arg196,
    // and accumulate into %879, giving %890.
    // NOTE(review): same linear-layer backward pattern as the neighbouring blocks, presumably
    // for the third attention projection — inferred, confirm.
    // %49, %28, %24, %3, %true, %none, %int1 are defined outside this chunk.
    %881 = torch.aten.view %852, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_417 = torch.constant.int 0
    %int1_418 = torch.constant.int 1
    // %881 @ transpose(%arg195) -> [2048,768]
    %882 = torch.aten.transpose.int %arg195, %int0_417, %int1_418 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %883 = torch.aten.mm %881, %882 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_419 = torch.constant.int 0
    %int1_420 = torch.constant.int 1
    // transpose(%881) @ %arg196 -> [768,768], then transposed.
    %884 = torch.aten.transpose.int %881, %int0_419, %int1_420 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %885 = torch.aten.mm %884, %arg196 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_421 = torch.constant.int 0
    %int1_422 = torch.constant.int 1
    %886 = torch.aten.transpose.int %885, %int0_421, %int1_422 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %881 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %887 = torch.aten.sum.dim_IntList %881, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %888 = torch.aten.view %887, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768] and add it to the running sum %879.
    %889 = torch.aten.view %883, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %890 = torch.aten.add.Tensor %879, %889, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_423 = torch.constant.int 0
    %int1_424 = torch.constant.int 1
    // Second transpose of the same dims: %891 has the orientation of %885 again.
    %891 = torch.aten.transpose.int %886, %int0_423, %int1_424 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Elementwise/reduction arithmetic on %890, %arg194, %result1_34, %result2_35 and %arg33:
    //   %893 = (%arg194 - %result1_34) * %result2_35
    //   %894 = %890 * %arg33
    //   %903 = (%result2_35 / %0) * (%894 * %0 - sum(%894) - %893 * sum(%894 * %893))
    // NOTE(review): this is the shape of a layer-norm backward, presumably with %result1_34 =
    // mean, %result2_35 = rstd, %arg33 = weight and %0 = normalized size (768) — confirm.
    // %0 (0-d si64 tensor), %35, %45, %true, %false, %none, %int1 are defined outside this chunk.
    %892 = torch.aten.sub.Tensor %arg194, %result1_34, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %893 = torch.aten.mul.Tensor %892, %result2_35 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %894 = torch.aten.mul.Tensor %890, %arg33 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %895 = torch.aten.mul.Tensor %894, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over the dim list %35 with keepdim -> [4,512,1].
    %896 = torch.aten.sum.dim_IntList %894, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %897 = torch.aten.mul.Tensor %894, %893 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %898 = torch.aten.sum.dim_IntList %897, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %899 = torch.aten.mul.Tensor %893, %898 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %900 = torch.aten.sub.Tensor %895, %896, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %901 = torch.aten.sub.Tensor %900, %899, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %902 = torch.aten.div.Tensor %result2_35, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %903 = torch.aten.mul.Tensor %902, %901 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over the dim list %45 without keepdim -> [768]
    // (presumably the weight and bias gradients — confirm).
    %904 = torch.aten.mul.Tensor %890, %893 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %905 = torch.aten.sum.dim_IntList %904, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %906 = torch.aten.sum.dim_IntList %890, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Elementwise multiply by %arg193 (presumably a saved dropout mask — confirm).
    %907 = torch.aten.mul.Tensor %903, %arg193 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Flatten %907 to [2048,768] (%908) and form three products from it with %arg191 / %arg192.
    // NOTE(review): matches a linear-layer backward for a 3072 -> 768 projection (presumably the
    // feed-forward output layer): %916 input grad, %917 weight grad, %915 bias grad — confirm.
    // %49, %84, %24, %3, %true, %none are defined outside this chunk.
    %908 = torch.aten.view %907, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_425 = torch.constant.int 0
    %int1_426 = torch.constant.int 1
    // %908 @ transpose(%arg191) -> [2048,3072]
    %909 = torch.aten.transpose.int %arg191, %int0_425, %int1_426 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %910 = torch.aten.mm %908, %909 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_427 = torch.constant.int 0
    %int1_428 = torch.constant.int 1
    // transpose(%908) @ %arg192 -> [768,3072], then transposed to [3072,768].
    %911 = torch.aten.transpose.int %908, %int0_427, %int1_428 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %912 = torch.aten.mm %911, %arg192 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_429 = torch.constant.int 0
    %int1_430 = torch.constant.int 1
    %913 = torch.aten.transpose.int %912, %int0_429, %int1_430 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Sum %908 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %914 = torch.aten.sum.dim_IntList %908, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %915 = torch.aten.view %914, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,3072].
    %916 = torch.aten.view %910, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_431 = torch.constant.int 0
    %int1_432 = torch.constant.int 1
    // Second transpose of the same dims: %917 is [768,3072] like %912.
    %917 = torch.aten.transpose.int %913, %int0_431, %int1_432 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // Apply gelu_backward to %916 with the saved tensor %arg190, then form three products from
    // the flattened result with %arg188 / %arg189 and add the first one to %903, giving %928.
    // NOTE(review): matches a linear-layer backward for a 768 -> 3072 projection (presumably
    // the feed-forward input layer), and the add to %903 looks like a residual-path gradient
    // accumulation — inferred, confirm.
    // %str, %88, %96, %28, %24, %true, %none, %int1 are defined outside this chunk.
    %918 = torch.aten.gelu_backward %916, %arg190, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %919 = torch.aten.view %918, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_433 = torch.constant.int 0
    %int1_434 = torch.constant.int 1
    // %919 @ transpose(%arg188) -> [2048,768]
    %920 = torch.aten.transpose.int %arg188, %int0_433, %int1_434 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %921 = torch.aten.mm %919, %920 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_435 = torch.constant.int 0
    %int1_436 = torch.constant.int 1
    // transpose(%919) @ %arg189 -> [3072,768], then transposed to [768,3072].
    %922 = torch.aten.transpose.int %919, %int0_435, %int1_436 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %923 = torch.aten.mm %922, %arg189 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_437 = torch.constant.int 0
    %int1_438 = torch.constant.int 1
    %924 = torch.aten.transpose.int %923, %int0_437, %int1_438 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // Sum %919 over the dim list %24 (keepdim) -> [1,3072], then view as [3072].
    %925 = torch.aten.sum.dim_IntList %919, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %926 = torch.aten.view %925, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    // Reshape the first product to [4,512,768] and add it to %903 (alpha operand %int1).
    %927 = torch.aten.view %921, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %928 = torch.aten.add.Tensor %903, %927, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_439 = torch.constant.int 0
    %int1_440 = torch.constant.int 1
    // Second transpose of the same dims: %929 is [3072,768] like %923.
    %929 = torch.aten.transpose.int %924, %int0_439, %int1_440 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Elementwise/reduction arithmetic on %928, %arg187, %result1_31, %result2_32 and %arg31:
    //   %931 = (%arg187 - %result1_31) * %result2_32
    //   %932 = %928 * %arg31
    //   %941 = (%result2_32 / %0) * (%932 * %0 - sum(%932) - %931 * sum(%932 * %931))
    // NOTE(review): this is the shape of a layer-norm backward, presumably with %result1_31 =
    // mean, %result2_32 = rstd, %arg31 = weight and %0 = normalized size (768) — confirm.
    // %0 (0-d si64 tensor), %35, %45, %true, %false, %none, %int1 are defined outside this chunk.
    %930 = torch.aten.sub.Tensor %arg187, %result1_31, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %931 = torch.aten.mul.Tensor %930, %result2_32 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %932 = torch.aten.mul.Tensor %928, %arg31 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %933 = torch.aten.mul.Tensor %932, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over the dim list %35 with keepdim -> [4,512,1].
    %934 = torch.aten.sum.dim_IntList %932, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %935 = torch.aten.mul.Tensor %932, %931 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %936 = torch.aten.sum.dim_IntList %935, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %937 = torch.aten.mul.Tensor %931, %936 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %938 = torch.aten.sub.Tensor %933, %934, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %939 = torch.aten.sub.Tensor %938, %937, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %940 = torch.aten.div.Tensor %result2_32, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %941 = torch.aten.mul.Tensor %940, %939 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Reductions over the dim list %45 without keepdim -> [768]
    // (presumably the weight and bias gradients — confirm).
    %942 = torch.aten.mul.Tensor %928, %931 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %943 = torch.aten.sum.dim_IntList %942, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %944 = torch.aten.sum.dim_IntList %928, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Elementwise multiply by %arg186 (presumably a saved dropout mask — confirm).
    %945 = torch.aten.mul.Tensor %941, %arg186 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Flatten %945 to [2048,768] (%946) and form three products from it with %arg184 / %arg185.
    // NOTE(review): matches a linear-layer backward (presumably the attention output
    // projection): %954 input grad, %955 weight grad, %953 bias grad — inferred, confirm.
    // %49, %28, %24, %3, %true, %none are defined outside this chunk.
    %946 = torch.aten.view %945, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_441 = torch.constant.int 0
    %int1_442 = torch.constant.int 1
    // %946 @ transpose(%arg184) -> [2048,768]
    %947 = torch.aten.transpose.int %arg184, %int0_441, %int1_442 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %948 = torch.aten.mm %946, %947 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_443 = torch.constant.int 0
    %int1_444 = torch.constant.int 1
    // transpose(%946) @ %arg185 -> [768,768], then transposed.
    %949 = torch.aten.transpose.int %946, %int0_443, %int1_444 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %950 = torch.aten.mm %949, %arg185 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_445 = torch.constant.int 0
    %int1_446 = torch.constant.int 1
    %951 = torch.aten.transpose.int %950, %int0_445, %int1_446 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %946 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %952 = torch.aten.sum.dim_IntList %946, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %953 = torch.aten.view %952, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768].
    %954 = torch.aten.view %948, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_447 = torch.constant.int 0
    %int1_448 = torch.constant.int 1
    // Second transpose of the same dims: %955 has the orientation of %950 again.
    %955 = torch.aten.transpose.int %951, %int0_447, %int1_448 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Reshape the [4,512,768] gradient %954 into per-head layout and multiply it against
    // transposes of %arg182 and %arg183.
    // NOTE(review): looks like the backward of a (probs @ value) batched matmul — inferred
    // from the op pattern only; confirm against the forward graph.
    // %127, %129, %132, %138, %140, %int0, %int1, %int2 are defined outside this chunk.
    %956 = torch.aten.view %954, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %957 = torch.aten.permute %956, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %958 = torch.aten.clone %957, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    // Fold the two leading dims (4 x 12) into one dim of 48 for bmm.
    %959 = torch.aten.view %958, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    // transpose(%arg182) @ %959 -> [48,512,64]
    %960 = torch.aten.transpose.int %arg182, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %961 = torch.aten.bmm %960, %959 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    // %959 @ transpose(%arg183) -> [48,512,512]
    %962 = torch.aten.transpose.int %arg183, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %963 = torch.aten.bmm %959, %962 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    // Unfold both bmm results back to 4-D.
    %964 = torch.aten.view %961, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %965 = torch.aten.view %963, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    // Computes %967 - %arg180 * sum(%967, dim=-1, keepdim) and divides by the scalar tensor %1,
    // where %967 = %965 * %arg181 * %arg180.
    // NOTE(review): this has the shape of dropout-mask multiply followed by softmax backward
    // (g*y - y*sum(g*y)) and the attention scale division; presumably %arg181 is the dropout
    // mask, %arg180 the softmax output and %1 the sqrt(head_dim) scale — confirm.
    // %int-1, %1 and %155 are defined outside this chunk.
    %966 = torch.aten.mul.Tensor %965, %arg181 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %967 = torch.aten.mul.Tensor %966, %arg180 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // Reduce over the dim list [%int-1] with keepdim = true -> [4,12,512,1].
    %968 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_449 = torch.constant.bool true
    %none_450 = torch.constant.none
    %969 = torch.aten.sum.dim_IntList %967, %968, %true_449, %none_450 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    // Query each dimension of %967 to build the target shape for the broadcast below.
    %int0_451 = torch.constant.int 0
    %970 = torch.aten.size.int %967, %int0_451 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_452 = torch.constant.int 1
    %971 = torch.aten.size.int %967, %int1_452 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_453 = torch.constant.int 2
    %972 = torch.aten.size.int %967, %int2_453 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_454 = torch.constant.int 3
    %973 = torch.aten.size.int %967, %int3_454 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %974 = torch.prim.ListConstruct %970, %971, %972, %973 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Expand the reduced sum back to the full [4,12,512,512] shape.
    %975 = torch.aten.broadcast_to %969, %974 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %976 = torch.aten.mul.Tensor %arg180, %975 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // %967 - 1.0 * %976
    %float1.000000e00_455 = torch.constant.float 1.000000e+00
    %977 = torch.aten.sub.Tensor %967, %976, %float1.000000e00_455 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    // Divide by the 0-d f64 tensor %1; the result stays f32.
    %978 = torch.aten.div.Tensor %977, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    // Fold to [48,512,512] for the batched matmuls that follow.
    %979 = torch.aten.view %978, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // Multiply %979 against transposes of %arg178 and %arg179, then bring the three per-head
    // tensors (%984, %985 and the earlier %964) back toward [4,512,768] layout.
    // NOTE(review): looks like the backward of the (query @ key^T) batched matmul, with %989,
    // %986 and %993 being the gradients fed to three projection layers — inferred, confirm.
    // %161, %138, %129, %28, %49, %int0, %int1, %int2, %int-1, %int-2 are defined outside this chunk.
    // transpose(%arg178) @ %979 -> [48,64,512]
    %980 = torch.aten.transpose.int %arg178, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %981 = torch.aten.bmm %980, %979 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    // %979 @ transpose(%arg179) -> [48,512,64]
    %982 = torch.aten.transpose.int %arg179, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %983 = torch.aten.bmm %979, %982 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %984 = torch.aten.view %981, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %985 = torch.aten.view %983, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    // Swap the last two dims of %984 so it matches the [4,12,512,64] layout of %985.
    %986 = torch.aten.transpose.int %984, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    // %985: permute -> clone -> view merges the 12 x 64 head dims back into 768.
    %987 = torch.aten.permute %985, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %988 = torch.aten.clone %987, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %989 = torch.aten.view %988, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // %964 (produced by the earlier bmm with %arg182): same merge, then flatten to [2048,768].
    %990 = torch.aten.permute %964, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %991 = torch.aten.clone %990, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %992 = torch.aten.view %991, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %993 = torch.aten.view %992, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    // Three products derived from the [2048,768] tensor %993, then accumulation into %941.
    // NOTE(review): matches the usual linear-layer backward triple (input grad, weight grad,
    // bias grad), presumably for one attention projection — inferred, confirm.
    // %24, %3, %28, %true, %none, %int1 are defined outside this chunk.
    %int0_456 = torch.constant.int 0
    %int1_457 = torch.constant.int 1
    // %993 @ transpose(%arg176) -> [2048,768]
    %994 = torch.aten.transpose.int %arg176, %int0_456, %int1_457 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %995 = torch.aten.mm %993, %994 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_458 = torch.constant.int 0
    %int1_459 = torch.constant.int 1
    // transpose(%993) @ %arg177 -> [768,768], then transposed.
    %996 = torch.aten.transpose.int %993, %int0_458, %int1_459 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %997 = torch.aten.mm %996, %arg177 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_460 = torch.constant.int 0
    %int1_461 = torch.constant.int 1
    %998 = torch.aten.transpose.int %997, %int0_460, %int1_461 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %993 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %999 = torch.aten.sum.dim_IntList %993, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1000 = torch.aten.view %999, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768] and add it to %941 (alpha operand %int1).
    %1001 = torch.aten.view %995, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1002 = torch.aten.add.Tensor %941, %1001, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_462 = torch.constant.int 0
    %int1_463 = torch.constant.int 1
    // Second transpose of the same dims: %1003 has the orientation of %997 again.
    %1003 = torch.aten.transpose.int %998, %int0_462, %int1_463 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Merge the heads of %986 into a [2048,768] tensor (%1007), form three products from it
    // with %arg174 / %arg175, and accumulate into %1002.
    // NOTE(review): same linear-layer backward pattern as the neighbouring blocks, presumably
    // for another attention projection — inferred, confirm.
    // %129, %28, %49, %24, %3, %true, %none, %int0, %int1 are defined outside this chunk.
    %1004 = torch.aten.permute %986, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    // Note the order here is view -> clone (the sibling paths use clone -> view).
    %1005 = torch.aten.view %1004, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1006 = torch.aten.clone %1005, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1007 = torch.aten.view %1006, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_464 = torch.constant.int 0
    %int1_465 = torch.constant.int 1
    // %1007 @ transpose(%arg174) -> [2048,768]
    %1008 = torch.aten.transpose.int %arg174, %int0_464, %int1_465 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1009 = torch.aten.mm %1007, %1008 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_466 = torch.constant.int 0
    %int1_467 = torch.constant.int 1
    // transpose(%1007) @ %arg175 -> [768,768], then transposed.
    %1010 = torch.aten.transpose.int %1007, %int0_466, %int1_467 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1011 = torch.aten.mm %1010, %arg175 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_468 = torch.constant.int 0
    %int1_469 = torch.constant.int 1
    %1012 = torch.aten.transpose.int %1011, %int0_468, %int1_469 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %1007 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %1013 = torch.aten.sum.dim_IntList %1007, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1014 = torch.aten.view %1013, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768] and add it to the running sum %1002.
    %1015 = torch.aten.view %1009, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1016 = torch.aten.add.Tensor %1002, %1015, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_470 = torch.constant.int 0
    %int1_471 = torch.constant.int 1
    // Second transpose of the same dims: %1017 has the orientation of %1011 again.
    %1017 = torch.aten.transpose.int %1012, %int0_470, %int1_471 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Flatten %989 to [2048,768] (%1018), form three products from it with %arg172 / %arg173,
    // and accumulate into %1016, giving %1027.
    // NOTE(review): same linear-layer backward pattern as the neighbouring blocks, presumably
    // for the third attention projection — inferred, confirm.
    // %49, %28, %24, %3, %true, %none, %int1 are defined outside this chunk.
    %1018 = torch.aten.view %989, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_472 = torch.constant.int 0
    %int1_473 = torch.constant.int 1
    // %1018 @ transpose(%arg172) -> [2048,768]
    %1019 = torch.aten.transpose.int %arg172, %int0_472, %int1_473 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1020 = torch.aten.mm %1018, %1019 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_474 = torch.constant.int 0
    %int1_475 = torch.constant.int 1
    // transpose(%1018) @ %arg173 -> [768,768], then transposed.
    %1021 = torch.aten.transpose.int %1018, %int0_474, %int1_475 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1022 = torch.aten.mm %1021, %arg173 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_476 = torch.constant.int 0
    %int1_477 = torch.constant.int 1
    %1023 = torch.aten.transpose.int %1022, %int0_476, %int1_477 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Sum %1018 over the dim list %24 (keepdim) -> [1,768], then view as [768].
    %1024 = torch.aten.sum.dim_IntList %1018, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1025 = torch.aten.view %1024, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Reshape the first product to [4,512,768] and add it to the running sum %1016.
    %1026 = torch.aten.view %1020, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1027 = torch.aten.add.Tensor %1016, %1026, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_478 = torch.constant.int 0
    %int1_479 = torch.constant.int 1
    // Second transpose of the same dims: %1028 has the orientation of %1022 again.
    %1028 = torch.aten.transpose.int %1023, %int0_478, %int1_479 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // NOTE(review): this sequence has the form of a decomposed layer-norm backward
    // with incoming gradient g = %1027:
    //   x_hat = (x - mean) * rstd
    //   dx    = (rstd / N) * (N * g*w - sum(g*w) - x_hat * sum(g*w * x_hat))
    // Presumably %arg171 = x, %result1_28 = mean, %result2_29 = rstd, %arg29 = w and
    // %0 = N (a scalar si64 tensor defined earlier) — confirm against the forward graph.
    %1029 = torch.aten.sub.Tensor %arg171, %result1_28, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1030 = torch.aten.mul.Tensor %1029, %result2_29 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1031 = g * %arg29; %1032 scales it by the scalar tensor %0.
    %1031 = torch.aten.mul.Tensor %1027, %arg29 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1032 = torch.aten.mul.Tensor %1031, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    // Two keepdim reductions over the dims in %35, each yielding [4,512,1].
    %1033 = torch.aten.sum.dim_IntList %1031, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1034 = torch.aten.mul.Tensor %1031, %1030 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1035 = torch.aten.sum.dim_IntList %1034, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1036 = torch.aten.mul.Tensor %1030, %1035 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1038 = %1032 - %1033 - %1036, then scaled by (%result2_29 / %0) to give %1040.
    %1037 = torch.aten.sub.Tensor %1032, %1033, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1038 = torch.aten.sub.Tensor %1037, %1036, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1039 = torch.aten.div.Tensor %result2_29, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1040 = torch.aten.mul.Tensor %1039, %1038 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // [768]-shaped reductions over the dims in %45: sum(g * x_hat) and sum(g)
    // (presumably the layer-norm weight and bias gradients).
    %1041 = torch.aten.mul.Tensor %1027, %1030 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1042 = torch.aten.sum.dim_IntList %1041, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1043 = torch.aten.sum.dim_IntList %1027, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Elementwise multiply of %1040 by the saved tensor %arg170, followed by a
    // linear-layer-style backward that maps 768 -> 3072 features.
    // NOTE(review): presumably %arg170 is a (scaled) dropout mask, %arg168 the layer
    // weight and %arg169 the saved forward input — confirm against the forward graph.
    %1044 = torch.aten.mul.Tensor %1040, %arg170 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1045 = torch.aten.view %1044, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_480 = torch.constant.int 0
    %int1_481 = torch.constant.int 1
    // %1047 = %1045 x transpose(%arg168) -> [2048,3072].
    %1046 = torch.aten.transpose.int %arg168, %int0_480, %int1_481 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1047 = torch.aten.mm %1045, %1046 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_482 = torch.constant.int 0
    %int1_483 = torch.constant.int 1
    // %1049 = transpose(%1045) x %arg169 -> [768,3072], then transposed to [3072,768].
    %1048 = torch.aten.transpose.int %1045, %int0_482, %int1_483 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1049 = torch.aten.mm %1048, %arg169 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_484 = torch.constant.int 0
    %int1_485 = torch.constant.int 1
    %1050 = torch.aten.transpose.int %1049, %int0_484, %int1_485 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Keepdim reduction of %1045 over the dims in %24, viewed as [768].
    %1051 = torch.aten.sum.dim_IntList %1045, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1052 = torch.aten.view %1051, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore the batched shape [4,512,3072] for the next op.
    %1053 = torch.aten.view %1047, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_486 = torch.constant.int 0
    %int1_487 = torch.constant.int 1
    // %1050 is transposed back to [768,3072].
    %1054 = torch.aten.transpose.int %1050, %int0_486, %int1_487 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // GELU backward on %1053 using the saved tensor %arg167, followed by a
    // linear-layer-style backward that maps 3072 -> 768 features, and accumulation
    // with %1040.
    // NOTE(review): presumably %arg167 is the GELU input, %arg165 the layer weight and
    // %arg166 the saved forward input; %str (defined earlier) selects the GELU
    // approximation mode — confirm against the forward graph.
    %1055 = torch.aten.gelu_backward %1053, %arg167, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    // Flatten [4,512,3072] -> [2048,3072] for the 2-D mm ops.
    %1056 = torch.aten.view %1055, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_488 = torch.constant.int 0
    %int1_489 = torch.constant.int 1
    // %1058 = %1056 x transpose(%arg165) -> [2048,768].
    %1057 = torch.aten.transpose.int %arg165, %int0_488, %int1_489 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1058 = torch.aten.mm %1056, %1057 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_490 = torch.constant.int 0
    %int1_491 = torch.constant.int 1
    // %1060 = transpose(%1056) x %arg166 -> [3072,768], then transposed to [768,3072].
    %1059 = torch.aten.transpose.int %1056, %int0_490, %int1_491 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %1060 = torch.aten.mm %1059, %arg166 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_492 = torch.constant.int 0
    %int1_493 = torch.constant.int 1
    %1061 = torch.aten.transpose.int %1060, %int0_492, %int1_493 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // Keepdim reduction of %1056 over the dims in %24, viewed as [3072].
    %1062 = torch.aten.sum.dim_IntList %1056, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %1063 = torch.aten.view %1062, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    // Restore [4,512,768] and add to %1040 (the value that was fed into this branch
    // before the %arg170 multiply), presumably the residual-connection gradient.
    %1064 = torch.aten.view %1058, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1065 = torch.aten.add.Tensor %1040, %1064, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_494 = torch.constant.int 0
    %int1_495 = torch.constant.int 1
    // %1061 is transposed back to [3072,768].
    %1066 = torch.aten.transpose.int %1061, %int0_494, %int1_495 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // NOTE(review): this sequence has the form of a decomposed layer-norm backward
    // with incoming gradient g = %1065:
    //   x_hat = (x - mean) * rstd
    //   dx    = (rstd / N) * (N * g*w - sum(g*w) - x_hat * sum(g*w * x_hat))
    // Presumably %arg164 = x, %result1_25 = mean, %result2_26 = rstd, %arg27 = w and
    // %0 = N (a scalar si64 tensor defined earlier) — confirm against the forward graph.
    %1067 = torch.aten.sub.Tensor %arg164, %result1_25, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1068 = torch.aten.mul.Tensor %1067, %result2_26 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1069 = g * %arg27; %1070 scales it by the scalar tensor %0.
    %1069 = torch.aten.mul.Tensor %1065, %arg27 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1070 = torch.aten.mul.Tensor %1069, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    // Two keepdim reductions over the dims in %35, each yielding [4,512,1].
    %1071 = torch.aten.sum.dim_IntList %1069, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1072 = torch.aten.mul.Tensor %1069, %1068 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1073 = torch.aten.sum.dim_IntList %1072, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1074 = torch.aten.mul.Tensor %1068, %1073 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1076 = %1070 - %1071 - %1074, then scaled by (%result2_26 / %0) to give %1078.
    %1075 = torch.aten.sub.Tensor %1070, %1071, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1076 = torch.aten.sub.Tensor %1075, %1074, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1077 = torch.aten.div.Tensor %result2_26, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1078 = torch.aten.mul.Tensor %1077, %1076 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // [768]-shaped reductions over the dims in %45: sum(g * x_hat) and sum(g)
    // (presumably the layer-norm weight and bias gradients).
    %1079 = torch.aten.mul.Tensor %1065, %1068 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1080 = torch.aten.sum.dim_IntList %1079, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1081 = torch.aten.sum.dim_IntList %1065, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Elementwise multiply of %1078 by the saved tensor %arg163, followed by a
    // linear-layer-style backward with a [768,768] weight.
    // NOTE(review): presumably %arg163 is a (scaled) dropout mask, %arg161 the layer
    // weight and %arg162 the saved forward input — confirm against the forward graph.
    %1082 = torch.aten.mul.Tensor %1078, %arg163 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1083 = torch.aten.view %1082, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_496 = torch.constant.int 0
    %int1_497 = torch.constant.int 1
    // %1085 = %1083 x transpose(%arg161) -> [2048,768].
    %1084 = torch.aten.transpose.int %arg161, %int0_496, %int1_497 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1085 = torch.aten.mm %1083, %1084 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_498 = torch.constant.int 0
    %int1_499 = torch.constant.int 1
    // %1087 = transpose(%1083) x %arg162 -> [768,768], then transposed on dims (0,1).
    %1086 = torch.aten.transpose.int %1083, %int0_498, %int1_499 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1087 = torch.aten.mm %1086, %arg162 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_500 = torch.constant.int 0
    %int1_501 = torch.constant.int 1
    %1088 = torch.aten.transpose.int %1087, %int0_500, %int1_501 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Keepdim reduction of %1083 over the dims in %24, viewed as [768].
    %1089 = torch.aten.sum.dim_IntList %1083, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1090 = torch.aten.view %1089, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore the batched shape [4,512,768].
    %1091 = torch.aten.view %1085, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_502 = torch.constant.int 0
    %int1_503 = torch.constant.int 1
    // %1088 is transposed a second time on dims (0,1).
    %1092 = torch.aten.transpose.int %1088, %int0_502, %int1_503 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Reshape %1091 into per-head form and push it back through a batched matmul.
    // NOTE(review): this looks like the backward of the attention "probs x value" bmm;
    // presumably %arg159 holds the saved attention probabilities and %arg160 the saved
    // value tensor — confirm against the forward graph.
    // [4,512,768] -> [4,512,12,64] -> permute to [4,12,512,64] -> [48,512,64].
    %1093 = torch.aten.view %1091, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1094 = torch.aten.permute %1093, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1095 = torch.aten.clone %1094, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1096 = torch.aten.view %1095, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    // %1098 = transpose(%arg159) x %1096 -> [48,512,64].
    %1097 = torch.aten.transpose.int %arg159, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %1098 = torch.aten.bmm %1097, %1096 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    // %1100 = %1096 x transpose(%arg160) -> [48,512,512].
    %1099 = torch.aten.transpose.int %arg160, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1100 = torch.aten.bmm %1096, %1099 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    // Split the leading 48 back into [4,12] for both results.
    %1101 = torch.aten.view %1098, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1102 = torch.aten.view %1100, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    // Elementwise multiplies by two saved [4,12,512,512] tensors.
    // NOTE(review): presumably %arg158 is a (scaled) dropout mask and %arg157 the
    // softmax output (it is reused in the reduction that follows) — confirm.
    %1103 = torch.aten.mul.Tensor %1102, %arg158 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1104 = torch.aten.mul.Tensor %1103, %arg157 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // Computes %1114 = %1104 - %arg157 * sum(%1104 over the dim list [%int-1], keepdim),
    // then divides by the scalar tensor %1.
    // NOTE(review): with %1104 = grad * y and %arg157 = y this has the form of a
    // softmax backward; %1 (an f64 scalar tensor defined earlier) is presumably the
    // attention scaling factor — confirm against the forward graph.
    %1105 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_504 = torch.constant.bool true
    %none_505 = torch.constant.none
    %1106 = torch.aten.sum.dim_IntList %1104, %1105, %true_504, %none_505 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    // Collect the four dimension sizes of %1104 to build the broadcast target shape.
    %int0_506 = torch.constant.int 0
    %1107 = torch.aten.size.int %1104, %int0_506 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_507 = torch.constant.int 1
    %1108 = torch.aten.size.int %1104, %int1_507 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_508 = torch.constant.int 2
    %1109 = torch.aten.size.int %1104, %int2_508 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_509 = torch.constant.int 3
    %1110 = torch.aten.size.int %1104, %int3_509 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %1111 = torch.prim.ListConstruct %1107, %1108, %1109, %1110 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Expand the [4,12,512,1] row sums to the full [4,12,512,512] shape.
    %1112 = torch.aten.broadcast_to %1106, %1111 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1113 = torch.aten.mul.Tensor %arg157, %1112 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    // Subtraction uses alpha = 1.0.
    %float1.000000e00_510 = torch.constant.float 1.000000e+00
    %1114 = torch.aten.sub.Tensor %1104, %1113, %float1.000000e00_510 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %1115 = torch.aten.div.Tensor %1114, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    // Push %1115 back through a second batched matmul and merge heads again.
    // NOTE(review): this looks like the backward of the attention "query x key^T" bmm;
    // presumably %arg155 is the saved query and %arg156 the saved transposed key —
    // confirm against the forward graph.
    // Collapse [4,12,512,512] -> [48,512,512] for bmm.
    %1116 = torch.aten.view %1115, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    // %1118 = transpose(%arg155) x %1116 -> [48,64,512].
    %1117 = torch.aten.transpose.int %arg155, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1118 = torch.aten.bmm %1117, %1116 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    // %1120 = %1116 x transpose(%arg156) -> [48,512,64].
    %1119 = torch.aten.transpose.int %arg156, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %1120 = torch.aten.bmm %1116, %1119 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    // Split the leading 48 back into [4,12]; %1121 is additionally transposed on its
    // last two dims so that it ends up as [4,12,512,64].
    %1121 = torch.aten.view %1118, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %1122 = torch.aten.view %1120, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1123 = torch.aten.transpose.int %1121, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    // Merge heads for %1122: permute to [4,512,12,64], clone, view as [4,512,768].
    %1124 = torch.aten.permute %1122, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1125 = torch.aten.clone %1124, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1126 = torch.aten.view %1125, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // Merge heads for %1101 the same way.
    %1127 = torch.aten.permute %1101, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1128 = torch.aten.clone %1127, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1129 = torch.aten.view %1128, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    // Linear-layer-style backward step applied to %1129, accumulated onto %1078.
    // NOTE(review): presumably %arg153 is a [768,768] projection weight (likely the
    // attention value projection) and %arg154 its saved forward input — confirm.
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1130 = torch.aten.view %1129, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_511 = torch.constant.int 0
    %int1_512 = torch.constant.int 1
    // %1132 = %1130 x transpose(%arg153) -> [2048,768].
    %1131 = torch.aten.transpose.int %arg153, %int0_511, %int1_512 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1132 = torch.aten.mm %1130, %1131 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_513 = torch.constant.int 0
    %int1_514 = torch.constant.int 1
    // %1134 = transpose(%1130) x %arg154 -> [768,768], then transposed on dims (0,1).
    %1133 = torch.aten.transpose.int %1130, %int0_513, %int1_514 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1134 = torch.aten.mm %1133, %arg154 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_515 = torch.constant.int 0
    %int1_516 = torch.constant.int 1
    %1135 = torch.aten.transpose.int %1134, %int0_515, %int1_516 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Keepdim reduction of %1130 over the dims in %24, viewed as [768].
    %1136 = torch.aten.sum.dim_IntList %1130, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1137 = torch.aten.view %1136, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore [4,512,768] and add to %1078.
    %1138 = torch.aten.view %1132, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1139 = torch.aten.add.Tensor %1078, %1138, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_517 = torch.constant.int 0
    %int1_518 = torch.constant.int 1
    // %1135 is transposed a second time on dims (0,1).
    %1140 = torch.aten.transpose.int %1135, %int0_517, %int1_518 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Merge heads of %1123 and run a linear-layer-style backward step on the result,
    // accumulated onto %1139.
    // NOTE(review): presumably %arg151 is a [768,768] projection weight (likely the
    // attention key projection) and %arg152 its saved forward input — confirm.
    // Note the op order here is permute -> view -> clone (the other head merges in
    // this function use permute -> clone -> view).
    %1141 = torch.aten.permute %1123, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1142 = torch.aten.view %1141, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1143 = torch.aten.clone %1142, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1144 = torch.aten.view %1143, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_519 = torch.constant.int 0
    %int1_520 = torch.constant.int 1
    // %1146 = %1144 x transpose(%arg151) -> [2048,768].
    %1145 = torch.aten.transpose.int %arg151, %int0_519, %int1_520 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1146 = torch.aten.mm %1144, %1145 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_521 = torch.constant.int 0
    %int1_522 = torch.constant.int 1
    // %1148 = transpose(%1144) x %arg152 -> [768,768], then transposed on dims (0,1).
    %1147 = torch.aten.transpose.int %1144, %int0_521, %int1_522 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1148 = torch.aten.mm %1147, %arg152 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_523 = torch.constant.int 0
    %int1_524 = torch.constant.int 1
    %1149 = torch.aten.transpose.int %1148, %int0_523, %int1_524 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Keepdim reduction of %1144 over the dims in %24, viewed as [768].
    %1150 = torch.aten.sum.dim_IntList %1144, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1151 = torch.aten.view %1150, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore [4,512,768] and add to %1139.
    %1152 = torch.aten.view %1146, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1153 = torch.aten.add.Tensor %1139, %1152, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_525 = torch.constant.int 0
    %int1_526 = torch.constant.int 1
    // %1149 is transposed a second time on dims (0,1).
    %1154 = torch.aten.transpose.int %1149, %int0_525, %int1_526 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Linear-layer-style backward step applied to %1126, accumulated onto %1153.
    // NOTE(review): presumably %arg149 is a [768,768] projection weight (likely the
    // attention query projection) and %arg150 its saved forward input — confirm.
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1155 = torch.aten.view %1126, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_527 = torch.constant.int 0
    %int1_528 = torch.constant.int 1
    // %1157 = %1155 x transpose(%arg149) -> [2048,768].
    %1156 = torch.aten.transpose.int %arg149, %int0_527, %int1_528 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1157 = torch.aten.mm %1155, %1156 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_529 = torch.constant.int 0
    %int1_530 = torch.constant.int 1
    // %1159 = transpose(%1155) x %arg150 -> [768,768], then transposed on dims (0,1).
    %1158 = torch.aten.transpose.int %1155, %int0_529, %int1_530 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1159 = torch.aten.mm %1158, %arg150 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_531 = torch.constant.int 0
    %int1_532 = torch.constant.int 1
    %1160 = torch.aten.transpose.int %1159, %int0_531, %int1_532 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Keepdim reduction of %1155 over the dims in %24, viewed as [768].
    %1161 = torch.aten.sum.dim_IntList %1155, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1162 = torch.aten.view %1161, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore [4,512,768] and add to %1153; %1164 feeds the next normalization backward.
    %1163 = torch.aten.view %1157, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1164 = torch.aten.add.Tensor %1153, %1163, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_533 = torch.constant.int 0
    %int1_534 = torch.constant.int 1
    // %1160 is transposed a second time on dims (0,1).
    %1165 = torch.aten.transpose.int %1160, %int0_533, %int1_534 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // NOTE(review): this sequence has the form of a decomposed layer-norm backward
    // with incoming gradient g = %1164:
    //   x_hat = (x - mean) * rstd
    //   dx    = (rstd / N) * (N * g*w - sum(g*w) - x_hat * sum(g*w * x_hat))
    // Presumably %arg148 = x, %result1_22 = mean, %result2_23 = rstd, %arg25 = w and
    // %0 = N (a scalar si64 tensor defined earlier) — confirm against the forward graph.
    %1166 = torch.aten.sub.Tensor %arg148, %result1_22, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1167 = torch.aten.mul.Tensor %1166, %result2_23 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1168 = g * %arg25; %1169 scales it by the scalar tensor %0.
    %1168 = torch.aten.mul.Tensor %1164, %arg25 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1169 = torch.aten.mul.Tensor %1168, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    // Two keepdim reductions over the dims in %35, each yielding [4,512,1].
    %1170 = torch.aten.sum.dim_IntList %1168, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1171 = torch.aten.mul.Tensor %1168, %1167 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1172 = torch.aten.sum.dim_IntList %1171, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1173 = torch.aten.mul.Tensor %1167, %1172 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1175 = %1169 - %1170 - %1173, then scaled by (%result2_23 / %0) to give %1177.
    %1174 = torch.aten.sub.Tensor %1169, %1170, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1175 = torch.aten.sub.Tensor %1174, %1173, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1176 = torch.aten.div.Tensor %result2_23, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1177 = torch.aten.mul.Tensor %1176, %1175 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // [768]-shaped reductions over the dims in %45: sum(g * x_hat) and sum(g)
    // (presumably the layer-norm weight and bias gradients).
    %1178 = torch.aten.mul.Tensor %1164, %1167 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1179 = torch.aten.sum.dim_IntList %1178, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1180 = torch.aten.sum.dim_IntList %1164, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Elementwise multiply of %1177 by the saved tensor %arg147, followed by a
    // linear-layer-style backward that maps 768 -> 3072 features.
    // NOTE(review): presumably %arg147 is a (scaled) dropout mask, %arg145 the layer
    // weight and %arg146 the saved forward input — confirm against the forward graph.
    %1181 = torch.aten.mul.Tensor %1177, %arg147 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1182 = torch.aten.view %1181, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_535 = torch.constant.int 0
    %int1_536 = torch.constant.int 1
    // %1184 = %1182 x transpose(%arg145) -> [2048,3072].
    %1183 = torch.aten.transpose.int %arg145, %int0_535, %int1_536 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1184 = torch.aten.mm %1182, %1183 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_537 = torch.constant.int 0
    %int1_538 = torch.constant.int 1
    // %1186 = transpose(%1182) x %arg146 -> [768,3072], then transposed to [3072,768].
    %1185 = torch.aten.transpose.int %1182, %int0_537, %int1_538 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1186 = torch.aten.mm %1185, %arg146 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_539 = torch.constant.int 0
    %int1_540 = torch.constant.int 1
    %1187 = torch.aten.transpose.int %1186, %int0_539, %int1_540 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // Keepdim reduction of %1182 over the dims in %24, viewed as [768].
    %1188 = torch.aten.sum.dim_IntList %1182, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1189 = torch.aten.view %1188, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore the batched shape [4,512,3072] for the next op.
    %1190 = torch.aten.view %1184, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_541 = torch.constant.int 0
    %int1_542 = torch.constant.int 1
    // %1187 is transposed back to [768,3072].
    %1191 = torch.aten.transpose.int %1187, %int0_541, %int1_542 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // GELU backward on %1190 using the saved tensor %arg144, followed by a
    // linear-layer-style backward that maps 3072 -> 768 features, and accumulation
    // with %1177.
    // NOTE(review): presumably %arg144 is the GELU input, %arg142 the layer weight and
    // %arg143 the saved forward input; %str (defined earlier) selects the GELU
    // approximation mode — confirm against the forward graph.
    %1192 = torch.aten.gelu_backward %1190, %arg144, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    // Flatten [4,512,3072] -> [2048,3072] for the 2-D mm ops.
    %1193 = torch.aten.view %1192, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_543 = torch.constant.int 0
    %int1_544 = torch.constant.int 1
    // %1195 = %1193 x transpose(%arg142) -> [2048,768].
    %1194 = torch.aten.transpose.int %arg142, %int0_543, %int1_544 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1195 = torch.aten.mm %1193, %1194 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_545 = torch.constant.int 0
    %int1_546 = torch.constant.int 1
    // %1197 = transpose(%1193) x %arg143 -> [3072,768], then transposed to [768,3072].
    %1196 = torch.aten.transpose.int %1193, %int0_545, %int1_546 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %1197 = torch.aten.mm %1196, %arg143 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_547 = torch.constant.int 0
    %int1_548 = torch.constant.int 1
    %1198 = torch.aten.transpose.int %1197, %int0_547, %int1_548 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    // Keepdim reduction of %1193 over the dims in %24, viewed as [3072].
    %1199 = torch.aten.sum.dim_IntList %1193, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %1200 = torch.aten.view %1199, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    // Restore [4,512,768] and add to %1177 (the value that was fed into this branch
    // before the %arg147 multiply), presumably the residual-connection gradient.
    %1201 = torch.aten.view %1195, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1202 = torch.aten.add.Tensor %1177, %1201, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_549 = torch.constant.int 0
    %int1_550 = torch.constant.int 1
    // %1198 is transposed back to [3072,768].
    %1203 = torch.aten.transpose.int %1198, %int0_549, %int1_550 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    // NOTE(review): this sequence has the form of a decomposed layer-norm backward
    // with incoming gradient g = %1202:
    //   x_hat = (x - mean) * rstd
    //   dx    = (rstd / N) * (N * g*w - sum(g*w) - x_hat * sum(g*w * x_hat))
    // Presumably %arg141 = x, %result1_19 = mean, %result2_20 = rstd, %arg23 = w and
    // %0 = N (a scalar si64 tensor defined earlier) — confirm against the forward graph.
    %1204 = torch.aten.sub.Tensor %arg141, %result1_19, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1205 = torch.aten.mul.Tensor %1204, %result2_20 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1206 = g * %arg23; %1207 scales it by the scalar tensor %0.
    %1206 = torch.aten.mul.Tensor %1202, %arg23 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1207 = torch.aten.mul.Tensor %1206, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    // Two keepdim reductions over the dims in %35, each yielding [4,512,1].
    %1208 = torch.aten.sum.dim_IntList %1206, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1209 = torch.aten.mul.Tensor %1206, %1205 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1210 = torch.aten.sum.dim_IntList %1209, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1211 = torch.aten.mul.Tensor %1205, %1210 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    // %1213 = %1207 - %1208 - %1211, then scaled by (%result2_20 / %0) to give %1215.
    %1212 = torch.aten.sub.Tensor %1207, %1208, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1213 = torch.aten.sub.Tensor %1212, %1211, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1214 = torch.aten.div.Tensor %result2_20, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1215 = torch.aten.mul.Tensor %1214, %1213 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // [768]-shaped reductions over the dims in %45: sum(g * x_hat) and sum(g)
    // (presumably the layer-norm weight and bias gradients).
    %1216 = torch.aten.mul.Tensor %1202, %1205 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1217 = torch.aten.sum.dim_IntList %1216, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1218 = torch.aten.sum.dim_IntList %1202, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    // Elementwise multiply of %1215 by the saved tensor %arg140, followed by a
    // linear-layer-style backward with a [768,768] weight.
    // NOTE(review): presumably %arg140 is a (scaled) dropout mask, %arg138 the layer
    // weight and %arg139 the saved forward input — confirm against the forward graph.
    %1219 = torch.aten.mul.Tensor %1215, %arg140 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    // Flatten [4,512,768] -> [2048,768] for the 2-D mm ops.
    %1220 = torch.aten.view %1219, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_551 = torch.constant.int 0
    %int1_552 = torch.constant.int 1
    // %1222 = %1220 x transpose(%arg138) -> [2048,768].
    %1221 = torch.aten.transpose.int %arg138, %int0_551, %int1_552 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1222 = torch.aten.mm %1220, %1221 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_553 = torch.constant.int 0
    %int1_554 = torch.constant.int 1
    // %1224 = transpose(%1220) x %arg139 -> [768,768], then transposed on dims (0,1).
    %1223 = torch.aten.transpose.int %1220, %int0_553, %int1_554 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1224 = torch.aten.mm %1223, %arg139 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_555 = torch.constant.int 0
    %int1_556 = torch.constant.int 1
    %1225 = torch.aten.transpose.int %1224, %int0_555, %int1_556 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    // Keepdim reduction of %1220 over the dims in %24, viewed as [768].
    %1226 = torch.aten.sum.dim_IntList %1220, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1227 = torch.aten.view %1226, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    // Restore the batched shape [4,512,768].
    %1228 = torch.aten.view %1222, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_557 = torch.constant.int 0
    %int1_558 = torch.constant.int 1
    // %1225 is transposed a second time on dims (0,1).
    %1229 = torch.aten.transpose.int %1225, %int0_557, %int1_558 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1230 = torch.aten.view %1228, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1231 = torch.aten.permute %1230, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1232 = torch.aten.clone %1231, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1233 = torch.aten.view %1232, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %1234 = torch.aten.transpose.int %arg136, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %1235 = torch.aten.bmm %1234, %1233 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1236 = torch.aten.transpose.int %arg137, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1237 = torch.aten.bmm %1233, %1236 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %1238 = torch.aten.view %1235, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1239 = torch.aten.view %1237, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1240 = torch.aten.mul.Tensor %1239, %arg135 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1241 = torch.aten.mul.Tensor %1240, %arg134 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1242 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_559 = torch.constant.bool true
    %none_560 = torch.constant.none
    %1243 = torch.aten.sum.dim_IntList %1241, %1242, %true_559, %none_560 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_561 = torch.constant.int 0
    %1244 = torch.aten.size.int %1241, %int0_561 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_562 = torch.constant.int 1
    %1245 = torch.aten.size.int %1241, %int1_562 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_563 = torch.constant.int 2
    %1246 = torch.aten.size.int %1241, %int2_563 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_564 = torch.constant.int 3
    %1247 = torch.aten.size.int %1241, %int3_564 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %1248 = torch.prim.ListConstruct %1244, %1245, %1246, %1247 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1249 = torch.aten.broadcast_to %1243, %1248 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1250 = torch.aten.mul.Tensor %arg134, %1249 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_565 = torch.constant.float 1.000000e+00
    %1251 = torch.aten.sub.Tensor %1241, %1250, %float1.000000e00_565 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %1252 = torch.aten.div.Tensor %1251, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %1253 = torch.aten.view %1252, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    %1254 = torch.aten.transpose.int %arg132, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1255 = torch.aten.bmm %1254, %1253 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %1256 = torch.aten.transpose.int %arg133, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %1257 = torch.aten.bmm %1253, %1256 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1258 = torch.aten.view %1255, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %1259 = torch.aten.view %1257, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1260 = torch.aten.transpose.int %1258, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1261 = torch.aten.permute %1259, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1262 = torch.aten.clone %1261, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1263 = torch.aten.view %1262, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1264 = torch.aten.permute %1238, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1265 = torch.aten.clone %1264, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1266 = torch.aten.view %1265, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1267 = torch.aten.view %1266, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_566 = torch.constant.int 0
    %int1_567 = torch.constant.int 1
    %1268 = torch.aten.transpose.int %arg130, %int0_566, %int1_567 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1269 = torch.aten.mm %1267, %1268 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_568 = torch.constant.int 0
    %int1_569 = torch.constant.int 1
    %1270 = torch.aten.transpose.int %1267, %int0_568, %int1_569 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1271 = torch.aten.mm %1270, %arg131 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_570 = torch.constant.int 0
    %int1_571 = torch.constant.int 1
    %1272 = torch.aten.transpose.int %1271, %int0_570, %int1_571 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1273 = torch.aten.sum.dim_IntList %1267, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1274 = torch.aten.view %1273, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1275 = torch.aten.view %1269, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1276 = torch.aten.add.Tensor %1215, %1275, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_572 = torch.constant.int 0
    %int1_573 = torch.constant.int 1
    %1277 = torch.aten.transpose.int %1272, %int0_572, %int1_573 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1278 = torch.aten.permute %1260, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1279 = torch.aten.view %1278, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1280 = torch.aten.clone %1279, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1281 = torch.aten.view %1280, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_574 = torch.constant.int 0
    %int1_575 = torch.constant.int 1
    %1282 = torch.aten.transpose.int %arg128, %int0_574, %int1_575 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1283 = torch.aten.mm %1281, %1282 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_576 = torch.constant.int 0
    %int1_577 = torch.constant.int 1
    %1284 = torch.aten.transpose.int %1281, %int0_576, %int1_577 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1285 = torch.aten.mm %1284, %arg129 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_578 = torch.constant.int 0
    %int1_579 = torch.constant.int 1
    %1286 = torch.aten.transpose.int %1285, %int0_578, %int1_579 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1287 = torch.aten.sum.dim_IntList %1281, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1288 = torch.aten.view %1287, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1289 = torch.aten.view %1283, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1290 = torch.aten.add.Tensor %1276, %1289, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_580 = torch.constant.int 0
    %int1_581 = torch.constant.int 1
    %1291 = torch.aten.transpose.int %1286, %int0_580, %int1_581 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1292 = torch.aten.view %1263, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_582 = torch.constant.int 0
    %int1_583 = torch.constant.int 1
    %1293 = torch.aten.transpose.int %arg126, %int0_582, %int1_583 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1294 = torch.aten.mm %1292, %1293 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_584 = torch.constant.int 0
    %int1_585 = torch.constant.int 1
    %1295 = torch.aten.transpose.int %1292, %int0_584, %int1_585 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1296 = torch.aten.mm %1295, %arg127 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_586 = torch.constant.int 0
    %int1_587 = torch.constant.int 1
    %1297 = torch.aten.transpose.int %1296, %int0_586, %int1_587 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1298 = torch.aten.sum.dim_IntList %1292, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1299 = torch.aten.view %1298, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1300 = torch.aten.view %1294, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1301 = torch.aten.add.Tensor %1290, %1300, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_588 = torch.constant.int 0
    %int1_589 = torch.constant.int 1
    %1302 = torch.aten.transpose.int %1297, %int0_588, %int1_589 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1303 = torch.aten.sub.Tensor %arg125, %result1_16, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1304 = torch.aten.mul.Tensor %1303, %result2_17 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1305 = torch.aten.mul.Tensor %1301, %arg21 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1306 = torch.aten.mul.Tensor %1305, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1307 = torch.aten.sum.dim_IntList %1305, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1308 = torch.aten.mul.Tensor %1305, %1304 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1309 = torch.aten.sum.dim_IntList %1308, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1310 = torch.aten.mul.Tensor %1304, %1309 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1311 = torch.aten.sub.Tensor %1306, %1307, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1312 = torch.aten.sub.Tensor %1311, %1310, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1313 = torch.aten.div.Tensor %result2_17, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1314 = torch.aten.mul.Tensor %1313, %1312 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1315 = torch.aten.mul.Tensor %1301, %1304 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1316 = torch.aten.sum.dim_IntList %1315, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1317 = torch.aten.sum.dim_IntList %1301, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1318 = torch.aten.mul.Tensor %1314, %arg124 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1319 = torch.aten.view %1318, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_590 = torch.constant.int 0
    %int1_591 = torch.constant.int 1
    %1320 = torch.aten.transpose.int %arg122, %int0_590, %int1_591 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1321 = torch.aten.mm %1319, %1320 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_592 = torch.constant.int 0
    %int1_593 = torch.constant.int 1
    %1322 = torch.aten.transpose.int %1319, %int0_592, %int1_593 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1323 = torch.aten.mm %1322, %arg123 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_594 = torch.constant.int 0
    %int1_595 = torch.constant.int 1
    %1324 = torch.aten.transpose.int %1323, %int0_594, %int1_595 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1325 = torch.aten.sum.dim_IntList %1319, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1326 = torch.aten.view %1325, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1327 = torch.aten.view %1321, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_596 = torch.constant.int 0
    %int1_597 = torch.constant.int 1
    %1328 = torch.aten.transpose.int %1324, %int0_596, %int1_597 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1329 = torch.aten.gelu_backward %1327, %arg121, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %1330 = torch.aten.view %1329, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_598 = torch.constant.int 0
    %int1_599 = torch.constant.int 1
    %1331 = torch.aten.transpose.int %arg119, %int0_598, %int1_599 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1332 = torch.aten.mm %1330, %1331 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_600 = torch.constant.int 0
    %int1_601 = torch.constant.int 1
    %1333 = torch.aten.transpose.int %1330, %int0_600, %int1_601 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %1334 = torch.aten.mm %1333, %arg120 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_602 = torch.constant.int 0
    %int1_603 = torch.constant.int 1
    %1335 = torch.aten.transpose.int %1334, %int0_602, %int1_603 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1336 = torch.aten.sum.dim_IntList %1330, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %1337 = torch.aten.view %1336, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %1338 = torch.aten.view %1332, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1339 = torch.aten.add.Tensor %1314, %1338, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_604 = torch.constant.int 0
    %int1_605 = torch.constant.int 1
    %1340 = torch.aten.transpose.int %1335, %int0_604, %int1_605 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1341 = torch.aten.sub.Tensor %arg118, %result1_13, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1342 = torch.aten.mul.Tensor %1341, %result2_14 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1343 = torch.aten.mul.Tensor %1339, %arg19 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1344 = torch.aten.mul.Tensor %1343, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1345 = torch.aten.sum.dim_IntList %1343, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1346 = torch.aten.mul.Tensor %1343, %1342 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1347 = torch.aten.sum.dim_IntList %1346, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1348 = torch.aten.mul.Tensor %1342, %1347 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1349 = torch.aten.sub.Tensor %1344, %1345, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1350 = torch.aten.sub.Tensor %1349, %1348, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1351 = torch.aten.div.Tensor %result2_14, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1352 = torch.aten.mul.Tensor %1351, %1350 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1353 = torch.aten.mul.Tensor %1339, %1342 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1354 = torch.aten.sum.dim_IntList %1353, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1355 = torch.aten.sum.dim_IntList %1339, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1356 = torch.aten.mul.Tensor %1352, %arg117 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1357 = torch.aten.view %1356, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_606 = torch.constant.int 0
    %int1_607 = torch.constant.int 1
    %1358 = torch.aten.transpose.int %arg115, %int0_606, %int1_607 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1359 = torch.aten.mm %1357, %1358 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_608 = torch.constant.int 0
    %int1_609 = torch.constant.int 1
    %1360 = torch.aten.transpose.int %1357, %int0_608, %int1_609 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1361 = torch.aten.mm %1360, %arg116 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_610 = torch.constant.int 0
    %int1_611 = torch.constant.int 1
    %1362 = torch.aten.transpose.int %1361, %int0_610, %int1_611 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1363 = torch.aten.sum.dim_IntList %1357, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1364 = torch.aten.view %1363, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1365 = torch.aten.view %1359, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_612 = torch.constant.int 0
    %int1_613 = torch.constant.int 1
    %1366 = torch.aten.transpose.int %1362, %int0_612, %int1_613 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1367 = torch.aten.view %1365, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1368 = torch.aten.permute %1367, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1369 = torch.aten.clone %1368, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1370 = torch.aten.view %1369, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %1371 = torch.aten.transpose.int %arg113, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %1372 = torch.aten.bmm %1371, %1370 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1373 = torch.aten.transpose.int %arg114, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1374 = torch.aten.bmm %1370, %1373 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %1375 = torch.aten.view %1372, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1376 = torch.aten.view %1374, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1377 = torch.aten.mul.Tensor %1376, %arg112 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1378 = torch.aten.mul.Tensor %1377, %arg111 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1379 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_614 = torch.constant.bool true
    %none_615 = torch.constant.none
    %1380 = torch.aten.sum.dim_IntList %1378, %1379, %true_614, %none_615 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_616 = torch.constant.int 0
    %1381 = torch.aten.size.int %1378, %int0_616 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_617 = torch.constant.int 1
    %1382 = torch.aten.size.int %1378, %int1_617 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_618 = torch.constant.int 2
    %1383 = torch.aten.size.int %1378, %int2_618 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_619 = torch.constant.int 3
    %1384 = torch.aten.size.int %1378, %int3_619 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %1385 = torch.prim.ListConstruct %1381, %1382, %1383, %1384 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1386 = torch.aten.broadcast_to %1380, %1385 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1387 = torch.aten.mul.Tensor %arg111, %1386 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_620 = torch.constant.float 1.000000e+00
    %1388 = torch.aten.sub.Tensor %1378, %1387, %float1.000000e00_620 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %1389 = torch.aten.div.Tensor %1388, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %1390 = torch.aten.view %1389, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    %1391 = torch.aten.transpose.int %arg109, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1392 = torch.aten.bmm %1391, %1390 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %1393 = torch.aten.transpose.int %arg110, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %1394 = torch.aten.bmm %1390, %1393 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1395 = torch.aten.view %1392, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %1396 = torch.aten.view %1394, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1397 = torch.aten.transpose.int %1395, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1398 = torch.aten.permute %1396, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1399 = torch.aten.clone %1398, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1400 = torch.aten.view %1399, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1401 = torch.aten.permute %1375, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1402 = torch.aten.clone %1401, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1403 = torch.aten.view %1402, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1404 = torch.aten.view %1403, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_621 = torch.constant.int 0
    %int1_622 = torch.constant.int 1
    %1405 = torch.aten.transpose.int %arg107, %int0_621, %int1_622 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1406 = torch.aten.mm %1404, %1405 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_623 = torch.constant.int 0
    %int1_624 = torch.constant.int 1
    %1407 = torch.aten.transpose.int %1404, %int0_623, %int1_624 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1408 = torch.aten.mm %1407, %arg108 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_625 = torch.constant.int 0
    %int1_626 = torch.constant.int 1
    %1409 = torch.aten.transpose.int %1408, %int0_625, %int1_626 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1410 = torch.aten.sum.dim_IntList %1404, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1411 = torch.aten.view %1410, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1412 = torch.aten.view %1406, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1413 = torch.aten.add.Tensor %1352, %1412, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_627 = torch.constant.int 0
    %int1_628 = torch.constant.int 1
    %1414 = torch.aten.transpose.int %1409, %int0_627, %int1_628 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1415 = torch.aten.permute %1397, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1416 = torch.aten.view %1415, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1417 = torch.aten.clone %1416, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1418 = torch.aten.view %1417, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_629 = torch.constant.int 0
    %int1_630 = torch.constant.int 1
    %1419 = torch.aten.transpose.int %arg105, %int0_629, %int1_630 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1420 = torch.aten.mm %1418, %1419 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_631 = torch.constant.int 0
    %int1_632 = torch.constant.int 1
    %1421 = torch.aten.transpose.int %1418, %int0_631, %int1_632 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1422 = torch.aten.mm %1421, %arg106 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_633 = torch.constant.int 0
    %int1_634 = torch.constant.int 1
    %1423 = torch.aten.transpose.int %1422, %int0_633, %int1_634 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1424 = torch.aten.sum.dim_IntList %1418, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1425 = torch.aten.view %1424, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1426 = torch.aten.view %1420, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1427 = torch.aten.add.Tensor %1413, %1426, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_635 = torch.constant.int 0
    %int1_636 = torch.constant.int 1
    %1428 = torch.aten.transpose.int %1423, %int0_635, %int1_636 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1429 = torch.aten.view %1400, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_637 = torch.constant.int 0
    %int1_638 = torch.constant.int 1
    %1430 = torch.aten.transpose.int %arg103, %int0_637, %int1_638 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1431 = torch.aten.mm %1429, %1430 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_639 = torch.constant.int 0
    %int1_640 = torch.constant.int 1
    %1432 = torch.aten.transpose.int %1429, %int0_639, %int1_640 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1433 = torch.aten.mm %1432, %arg104 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_641 = torch.constant.int 0
    %int1_642 = torch.constant.int 1
    %1434 = torch.aten.transpose.int %1433, %int0_641, %int1_642 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1435 = torch.aten.sum.dim_IntList %1429, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1436 = torch.aten.view %1435, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1437 = torch.aten.view %1431, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1438 = torch.aten.add.Tensor %1427, %1437, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_643 = torch.constant.int 0
    %int1_644 = torch.constant.int 1
    %1439 = torch.aten.transpose.int %1434, %int0_643, %int1_644 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1440 = torch.aten.sub.Tensor %arg102, %result1_10, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1441 = torch.aten.mul.Tensor %1440, %result2_11 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1442 = torch.aten.mul.Tensor %1438, %arg9 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1443 = torch.aten.mul.Tensor %1442, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1444 = torch.aten.sum.dim_IntList %1442, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1445 = torch.aten.mul.Tensor %1442, %1441 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1446 = torch.aten.sum.dim_IntList %1445, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1447 = torch.aten.mul.Tensor %1441, %1446 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1448 = torch.aten.sub.Tensor %1443, %1444, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1449 = torch.aten.sub.Tensor %1448, %1447, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1450 = torch.aten.div.Tensor %result2_11, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1451 = torch.aten.mul.Tensor %1450, %1449 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1452 = torch.aten.mul.Tensor %1438, %1441 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1453 = torch.aten.sum.dim_IntList %1452, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1454 = torch.aten.sum.dim_IntList %1438, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1455 = torch.aten.mul.Tensor %1451, %arg101 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1456 = torch.aten.view %1455, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_645 = torch.constant.int 0
    %int1_646 = torch.constant.int 1
    %1457 = torch.aten.transpose.int %arg99, %int0_645, %int1_646 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1458 = torch.aten.mm %1456, %1457 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_647 = torch.constant.int 0
    %int1_648 = torch.constant.int 1
    %1459 = torch.aten.transpose.int %1456, %int0_647, %int1_648 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1460 = torch.aten.mm %1459, %arg100 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_649 = torch.constant.int 0
    %int1_650 = torch.constant.int 1
    %1461 = torch.aten.transpose.int %1460, %int0_649, %int1_650 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1462 = torch.aten.sum.dim_IntList %1456, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1463 = torch.aten.view %1462, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1464 = torch.aten.view %1458, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_651 = torch.constant.int 0
    %int1_652 = torch.constant.int 1
    %1465 = torch.aten.transpose.int %1461, %int0_651, %int1_652 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1466 = torch.aten.gelu_backward %1464, %arg98, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %1467 = torch.aten.view %1466, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_653 = torch.constant.int 0
    %int1_654 = torch.constant.int 1
    %1468 = torch.aten.transpose.int %arg96, %int0_653, %int1_654 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1469 = torch.aten.mm %1467, %1468 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_655 = torch.constant.int 0
    %int1_656 = torch.constant.int 1
    %1470 = torch.aten.transpose.int %1467, %int0_655, %int1_656 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %1471 = torch.aten.mm %1470, %arg97 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_657 = torch.constant.int 0
    %int1_658 = torch.constant.int 1
    %1472 = torch.aten.transpose.int %1471, %int0_657, %int1_658 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1473 = torch.aten.sum.dim_IntList %1467, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %1474 = torch.aten.view %1473, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %1475 = torch.aten.view %1469, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1476 = torch.aten.add.Tensor %1451, %1475, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_659 = torch.constant.int 0
    %int1_660 = torch.constant.int 1
    %1477 = torch.aten.transpose.int %1472, %int0_659, %int1_660 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1478 = torch.aten.sub.Tensor %arg95, %result1_7, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1479 = torch.aten.mul.Tensor %1478, %result2_8 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1480 = torch.aten.mul.Tensor %1476, %arg7 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1481 = torch.aten.mul.Tensor %1480, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1482 = torch.aten.sum.dim_IntList %1480, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1483 = torch.aten.mul.Tensor %1480, %1479 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1484 = torch.aten.sum.dim_IntList %1483, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1485 = torch.aten.mul.Tensor %1479, %1484 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1486 = torch.aten.sub.Tensor %1481, %1482, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1487 = torch.aten.sub.Tensor %1486, %1485, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1488 = torch.aten.div.Tensor %result2_8, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1489 = torch.aten.mul.Tensor %1488, %1487 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1490 = torch.aten.mul.Tensor %1476, %1479 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1491 = torch.aten.sum.dim_IntList %1490, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1492 = torch.aten.sum.dim_IntList %1476, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1493 = torch.aten.mul.Tensor %1489, %arg94 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1494 = torch.aten.view %1493, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_661 = torch.constant.int 0
    %int1_662 = torch.constant.int 1
    %1495 = torch.aten.transpose.int %arg92, %int0_661, %int1_662 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1496 = torch.aten.mm %1494, %1495 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_663 = torch.constant.int 0
    %int1_664 = torch.constant.int 1
    %1497 = torch.aten.transpose.int %1494, %int0_663, %int1_664 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1498 = torch.aten.mm %1497, %arg93 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_665 = torch.constant.int 0
    %int1_666 = torch.constant.int 1
    %1499 = torch.aten.transpose.int %1498, %int0_665, %int1_666 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1500 = torch.aten.sum.dim_IntList %1494, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1501 = torch.aten.view %1500, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1502 = torch.aten.view %1496, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_667 = torch.constant.int 0
    %int1_668 = torch.constant.int 1
    %1503 = torch.aten.transpose.int %1499, %int0_667, %int1_668 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1504 = torch.aten.view %1502, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1505 = torch.aten.permute %1504, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1506 = torch.aten.clone %1505, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1507 = torch.aten.view %1506, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %1508 = torch.aten.transpose.int %arg90, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %1509 = torch.aten.bmm %1508, %1507 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1510 = torch.aten.transpose.int %arg91, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1511 = torch.aten.bmm %1507, %1510 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %1512 = torch.aten.view %1509, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1513 = torch.aten.view %1511, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1514 = torch.aten.mul.Tensor %1513, %arg89 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1515 = torch.aten.mul.Tensor %1514, %arg88 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1516 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_669 = torch.constant.bool true
    %none_670 = torch.constant.none
    %1517 = torch.aten.sum.dim_IntList %1515, %1516, %true_669, %none_670 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_671 = torch.constant.int 0
    %1518 = torch.aten.size.int %1515, %int0_671 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_672 = torch.constant.int 1
    %1519 = torch.aten.size.int %1515, %int1_672 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_673 = torch.constant.int 2
    %1520 = torch.aten.size.int %1515, %int2_673 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_674 = torch.constant.int 3
    %1521 = torch.aten.size.int %1515, %int3_674 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %1522 = torch.prim.ListConstruct %1518, %1519, %1520, %1521 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1523 = torch.aten.broadcast_to %1517, %1522 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1524 = torch.aten.mul.Tensor %arg88, %1523 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_675 = torch.constant.float 1.000000e+00
    %1525 = torch.aten.sub.Tensor %1515, %1524, %float1.000000e00_675 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %1526 = torch.aten.div.Tensor %1525, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %1527 = torch.aten.view %1526, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    %1528 = torch.aten.transpose.int %arg86, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1529 = torch.aten.bmm %1528, %1527 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %1530 = torch.aten.transpose.int %arg87, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %1531 = torch.aten.bmm %1527, %1530 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1532 = torch.aten.view %1529, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %1533 = torch.aten.view %1531, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1534 = torch.aten.transpose.int %1532, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1535 = torch.aten.permute %1533, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1536 = torch.aten.clone %1535, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1537 = torch.aten.view %1536, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1538 = torch.aten.permute %1512, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1539 = torch.aten.clone %1538, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1540 = torch.aten.view %1539, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1541 = torch.aten.view %1540, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_676 = torch.constant.int 0
    %int1_677 = torch.constant.int 1
    %1542 = torch.aten.transpose.int %arg84, %int0_676, %int1_677 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1543 = torch.aten.mm %1541, %1542 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_678 = torch.constant.int 0
    %int1_679 = torch.constant.int 1
    %1544 = torch.aten.transpose.int %1541, %int0_678, %int1_679 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1545 = torch.aten.mm %1544, %arg85 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_680 = torch.constant.int 0
    %int1_681 = torch.constant.int 1
    %1546 = torch.aten.transpose.int %1545, %int0_680, %int1_681 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1547 = torch.aten.sum.dim_IntList %1541, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1548 = torch.aten.view %1547, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1549 = torch.aten.view %1543, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1550 = torch.aten.add.Tensor %1489, %1549, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_682 = torch.constant.int 0
    %int1_683 = torch.constant.int 1
    %1551 = torch.aten.transpose.int %1546, %int0_682, %int1_683 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1552 = torch.aten.permute %1534, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1553 = torch.aten.view %1552, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1554 = torch.aten.clone %1553, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1555 = torch.aten.view %1554, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_684 = torch.constant.int 0
    %int1_685 = torch.constant.int 1
    %1556 = torch.aten.transpose.int %arg82, %int0_684, %int1_685 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1557 = torch.aten.mm %1555, %1556 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_686 = torch.constant.int 0
    %int1_687 = torch.constant.int 1
    %1558 = torch.aten.transpose.int %1555, %int0_686, %int1_687 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1559 = torch.aten.mm %1558, %arg83 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_688 = torch.constant.int 0
    %int1_689 = torch.constant.int 1
    %1560 = torch.aten.transpose.int %1559, %int0_688, %int1_689 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1561 = torch.aten.sum.dim_IntList %1555, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1562 = torch.aten.view %1561, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1563 = torch.aten.view %1557, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1564 = torch.aten.add.Tensor %1550, %1563, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_690 = torch.constant.int 0
    %int1_691 = torch.constant.int 1
    %1565 = torch.aten.transpose.int %1560, %int0_690, %int1_691 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1566 = torch.aten.view %1537, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_692 = torch.constant.int 0
    %int1_693 = torch.constant.int 1
    %1567 = torch.aten.transpose.int %arg80, %int0_692, %int1_693 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1568 = torch.aten.mm %1566, %1567 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_694 = torch.constant.int 0
    %int1_695 = torch.constant.int 1
    %1569 = torch.aten.transpose.int %1566, %int0_694, %int1_695 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1570 = torch.aten.mm %1569, %arg81 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_696 = torch.constant.int 0
    %int1_697 = torch.constant.int 1
    %1571 = torch.aten.transpose.int %1570, %int0_696, %int1_697 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1572 = torch.aten.sum.dim_IntList %1566, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1573 = torch.aten.view %1572, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1574 = torch.aten.view %1568, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1575 = torch.aten.add.Tensor %1564, %1574, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_698 = torch.constant.int 0
    %int1_699 = torch.constant.int 1
    %1576 = torch.aten.transpose.int %1571, %int0_698, %int1_699 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1577 = torch.aten.sub.Tensor %arg79, %result1_4, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1578 = torch.aten.mul.Tensor %1577, %result2_5 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1579 = torch.aten.mul.Tensor %1575, %arg5 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1580 = torch.aten.mul.Tensor %1579, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1581 = torch.aten.sum.dim_IntList %1579, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1582 = torch.aten.mul.Tensor %1579, %1578 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1583 = torch.aten.sum.dim_IntList %1582, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1584 = torch.aten.mul.Tensor %1578, %1583 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1585 = torch.aten.sub.Tensor %1580, %1581, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1586 = torch.aten.sub.Tensor %1585, %1584, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1587 = torch.aten.div.Tensor %result2_5, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1588 = torch.aten.mul.Tensor %1587, %1586 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1589 = torch.aten.mul.Tensor %1575, %1578 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1590 = torch.aten.sum.dim_IntList %1589, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1591 = torch.aten.sum.dim_IntList %1575, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1592 = torch.aten.mul.Tensor %1588, %arg78 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1593 = torch.aten.view %1592, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_700 = torch.constant.int 0
    %int1_701 = torch.constant.int 1
    %1594 = torch.aten.transpose.int %arg76, %int0_700, %int1_701 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1595 = torch.aten.mm %1593, %1594 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[2048,3072],f32>
    %int0_702 = torch.constant.int 0
    %int1_703 = torch.constant.int 1
    %1596 = torch.aten.transpose.int %1593, %int0_702, %int1_703 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1597 = torch.aten.mm %1596, %arg77 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,3072],f32> -> !torch.vtensor<[768,3072],f32>
    %int0_704 = torch.constant.int 0
    %int1_705 = torch.constant.int 1
    %1598 = torch.aten.transpose.int %1597, %int0_704, %int1_705 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1599 = torch.aten.sum.dim_IntList %1593, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1600 = torch.aten.view %1599, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1601 = torch.aten.view %1595, %84 : !torch.vtensor<[2048,3072],f32>, !torch.list<int> -> !torch.vtensor<[4,512,3072],f32>
    %int0_706 = torch.constant.int 0
    %int1_707 = torch.constant.int 1
    %1602 = torch.aten.transpose.int %1598, %int0_706, %int1_707 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1603 = torch.aten.gelu_backward %1601, %arg75, %str : !torch.vtensor<[4,512,3072],f32>, !torch.vtensor<[4,512,3072],f32>, !torch.str -> !torch.vtensor<[4,512,3072],f32>
    %1604 = torch.aten.view %1603, %88 : !torch.vtensor<[4,512,3072],f32>, !torch.list<int> -> !torch.vtensor<[2048,3072],f32>
    %int0_708 = torch.constant.int 0
    %int1_709 = torch.constant.int 1
    %1605 = torch.aten.transpose.int %arg73, %int0_708, %int1_709 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1606 = torch.aten.mm %1604, %1605 : !torch.vtensor<[2048,3072],f32>, !torch.vtensor<[3072,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_710 = torch.constant.int 0
    %int1_711 = torch.constant.int 1
    %1607 = torch.aten.transpose.int %1604, %int0_710, %int1_711 : !torch.vtensor<[2048,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,2048],f32>
    %1608 = torch.aten.mm %1607, %arg74 : !torch.vtensor<[3072,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[3072,768],f32>
    %int0_712 = torch.constant.int 0
    %int1_713 = torch.constant.int 1
    %1609 = torch.aten.transpose.int %1608, %int0_712, %int1_713 : !torch.vtensor<[3072,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f32>
    %1610 = torch.aten.sum.dim_IntList %1604, %24, %true, %none : !torch.vtensor<[2048,3072],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,3072],f32>
    %1611 = torch.aten.view %1610, %96 : !torch.vtensor<[1,3072],f32>, !torch.list<int> -> !torch.vtensor<[3072],f32>
    %1612 = torch.aten.view %1606, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1613 = torch.aten.add.Tensor %1588, %1612, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_714 = torch.constant.int 0
    %int1_715 = torch.constant.int 1
    %1614 = torch.aten.transpose.int %1609, %int0_714, %int1_715 : !torch.vtensor<[768,3072],f32>, !torch.int, !torch.int -> !torch.vtensor<[3072,768],f32>
    %1615 = torch.aten.sub.Tensor %arg72, %result1_1, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1616 = torch.aten.mul.Tensor %1615, %result2_2 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1617 = torch.aten.mul.Tensor %1613, %arg3 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1618 = torch.aten.mul.Tensor %1617, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1619 = torch.aten.sum.dim_IntList %1617, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1620 = torch.aten.mul.Tensor %1617, %1616 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1621 = torch.aten.sum.dim_IntList %1620, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1622 = torch.aten.mul.Tensor %1616, %1621 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1623 = torch.aten.sub.Tensor %1618, %1619, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1624 = torch.aten.sub.Tensor %1623, %1622, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1625 = torch.aten.div.Tensor %result2_2, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1626 = torch.aten.mul.Tensor %1625, %1624 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1627 = torch.aten.mul.Tensor %1613, %1616 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1628 = torch.aten.sum.dim_IntList %1627, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1629 = torch.aten.sum.dim_IntList %1613, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1630 = torch.aten.mul.Tensor %1626, %arg71 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1631 = torch.aten.view %1630, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_716 = torch.constant.int 0
    %int1_717 = torch.constant.int 1
    %1632 = torch.aten.transpose.int %arg69, %int0_716, %int1_717 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1633 = torch.aten.mm %1631, %1632 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_718 = torch.constant.int 0
    %int1_719 = torch.constant.int 1
    %1634 = torch.aten.transpose.int %1631, %int0_718, %int1_719 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1635 = torch.aten.mm %1634, %arg70 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_720 = torch.constant.int 0
    %int1_721 = torch.constant.int 1
    %1636 = torch.aten.transpose.int %1635, %int0_720, %int1_721 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1637 = torch.aten.sum.dim_IntList %1631, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1638 = torch.aten.view %1637, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1639 = torch.aten.view %1633, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %int0_722 = torch.constant.int 0
    %int1_723 = torch.constant.int 1
    %1640 = torch.aten.transpose.int %1636, %int0_722, %int1_723 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1641 = torch.aten.view %1639, %127 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1642 = torch.aten.permute %1641, %129 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1643 = torch.aten.clone %1642, %int0 : !torch.vtensor<[4,12,512,64],f32>, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1644 = torch.aten.view %1643, %132 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[48,512,64],f32>
    %1645 = torch.aten.transpose.int %arg67, %int1, %int2 : !torch.vtensor<[48,512,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,512],f32>
    %1646 = torch.aten.bmm %1645, %1644 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1647 = torch.aten.transpose.int %arg68, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1648 = torch.aten.bmm %1644, %1647 : !torch.vtensor<[48,512,64],f32>, !torch.vtensor<[48,64,512],f32> -> !torch.vtensor<[48,512,512],f32>
    %1649 = torch.aten.view %1646, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1650 = torch.aten.view %1648, %140 : !torch.vtensor<[48,512,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1651 = torch.aten.mul.Tensor %1650, %arg66 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1652 = torch.aten.mul.Tensor %1651, %arg65 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %1653 = torch.prim.ListConstruct %int-1 : (!torch.int) -> !torch.list<int>
    %true_724 = torch.constant.bool true
    %none_725 = torch.constant.none
    %1654 = torch.aten.sum.dim_IntList %1652, %1653, %true_724, %none_725 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,12,512,1],f32>
    %int0_726 = torch.constant.int 0
    %1655 = torch.aten.size.int %1652, %int0_726 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int1_727 = torch.constant.int 1
    %1656 = torch.aten.size.int %1652, %int1_727 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int2_728 = torch.constant.int 2
    %1657 = torch.aten.size.int %1652, %int2_728 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %int3_729 = torch.constant.int 3
    %1658 = torch.aten.size.int %1652, %int3_729 : !torch.vtensor<[4,12,512,512],f32>, !torch.int -> !torch.int
    %1659 = torch.prim.ListConstruct %1655, %1656, %1657, %1658 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1660 = torch.aten.broadcast_to %1654, %1659 : !torch.vtensor<[4,12,512,1],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,512],f32>
    %1661 = torch.aten.mul.Tensor %arg65, %1660 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32> -> !torch.vtensor<[4,12,512,512],f32>
    %float1.000000e00_730 = torch.constant.float 1.000000e+00
    %1662 = torch.aten.sub.Tensor %1652, %1661, %float1.000000e00_730 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[4,12,512,512],f32>, !torch.float -> !torch.vtensor<[4,12,512,512],f32>
    %1663 = torch.aten.div.Tensor %1662, %1 : !torch.vtensor<[4,12,512,512],f32>, !torch.vtensor<[],f64> -> !torch.vtensor<[4,12,512,512],f32>
    %1664 = torch.aten.view %1663, %155 : !torch.vtensor<[4,12,512,512],f32>, !torch.list<int> -> !torch.vtensor<[48,512,512],f32>
    %1665 = torch.aten.transpose.int %arg63, %int1, %int2 : !torch.vtensor<[48,512,64],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,64,512],f32>
    %1666 = torch.aten.bmm %1665, %1664 : !torch.vtensor<[48,64,512],f32>, !torch.vtensor<[48,512,512],f32> -> !torch.vtensor<[48,64,512],f32>
    %1667 = torch.aten.transpose.int %arg64, %int1, %int2 : !torch.vtensor<[48,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[48,512,64],f32>
    %1668 = torch.aten.bmm %1664, %1667 : !torch.vtensor<[48,512,512],f32>, !torch.vtensor<[48,512,64],f32> -> !torch.vtensor<[48,512,64],f32>
    %1669 = torch.aten.view %1666, %161 : !torch.vtensor<[48,64,512],f32>, !torch.list<int> -> !torch.vtensor<[4,12,64,512],f32>
    %1670 = torch.aten.view %1668, %138 : !torch.vtensor<[48,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,12,512,64],f32>
    %1671 = torch.aten.transpose.int %1669, %int-1, %int-2 : !torch.vtensor<[4,12,64,512],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,12,512,64],f32>
    %1672 = torch.aten.permute %1670, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1673 = torch.aten.clone %1672, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1674 = torch.aten.view %1673, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1675 = torch.aten.permute %1649, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1676 = torch.aten.clone %1675, %int0 : !torch.vtensor<[4,512,12,64],f32>, !torch.int -> !torch.vtensor<[4,512,12,64],f32>
    %1677 = torch.aten.view %1676, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1678 = torch.aten.view %1677, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_731 = torch.constant.int 0
    %int1_732 = torch.constant.int 1
    %1679 = torch.aten.transpose.int %arg61, %int0_731, %int1_732 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1680 = torch.aten.mm %1678, %1679 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_733 = torch.constant.int 0
    %int1_734 = torch.constant.int 1
    %1681 = torch.aten.transpose.int %1678, %int0_733, %int1_734 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1682 = torch.aten.mm %1681, %arg62 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_735 = torch.constant.int 0
    %int1_736 = torch.constant.int 1
    %1683 = torch.aten.transpose.int %1682, %int0_735, %int1_736 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1684 = torch.aten.sum.dim_IntList %1678, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1685 = torch.aten.view %1684, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1686 = torch.aten.view %1680, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1687 = torch.aten.add.Tensor %1626, %1686, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_737 = torch.constant.int 0
    %int1_738 = torch.constant.int 1
    %1688 = torch.aten.transpose.int %1683, %int0_737, %int1_738 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1689 = torch.aten.permute %1671, %129 : !torch.vtensor<[4,12,512,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,12,64],f32>
    %1690 = torch.aten.view %1689, %28 : !torch.vtensor<[4,512,12,64],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1691 = torch.aten.clone %1690, %int0 : !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1692 = torch.aten.view %1691, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_739 = torch.constant.int 0
    %int1_740 = torch.constant.int 1
    %1693 = torch.aten.transpose.int %arg59, %int0_739, %int1_740 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1694 = torch.aten.mm %1692, %1693 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_741 = torch.constant.int 0
    %int1_742 = torch.constant.int 1
    %1695 = torch.aten.transpose.int %1692, %int0_741, %int1_742 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1696 = torch.aten.mm %1695, %arg60 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_743 = torch.constant.int 0
    %int1_744 = torch.constant.int 1
    %1697 = torch.aten.transpose.int %1696, %int0_743, %int1_744 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1698 = torch.aten.sum.dim_IntList %1692, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1699 = torch.aten.view %1698, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1700 = torch.aten.view %1694, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1701 = torch.aten.add.Tensor %1687, %1700, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_745 = torch.constant.int 0
    %int1_746 = torch.constant.int 1
    %1702 = torch.aten.transpose.int %1697, %int0_745, %int1_746 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1703 = torch.aten.view %1674, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %int0_747 = torch.constant.int 0
    %int1_748 = torch.constant.int 1
    %1704 = torch.aten.transpose.int %arg57, %int0_747, %int1_748 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1705 = torch.aten.mm %1703, %1704 : !torch.vtensor<[2048,768],f32>, !torch.vtensor<[768,768],f32> -> !torch.vtensor<[2048,768],f32>
    %int0_749 = torch.constant.int 0
    %int1_750 = torch.constant.int 1
    %1706 = torch.aten.transpose.int %1703, %int0_749, %int1_750 : !torch.vtensor<[2048,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,2048],f32>
    %1707 = torch.aten.mm %1706, %arg58 : !torch.vtensor<[768,2048],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[768,768],f32>
    %int0_751 = torch.constant.int 0
    %int1_752 = torch.constant.int 1
    %1708 = torch.aten.transpose.int %1707, %int0_751, %int1_752 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1709 = torch.aten.sum.dim_IntList %1703, %24, %true, %none : !torch.vtensor<[2048,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,768],f32>
    %1710 = torch.aten.view %1709, %3 : !torch.vtensor<[1,768],f32>, !torch.list<int> -> !torch.vtensor<[768],f32>
    %1711 = torch.aten.view %1705, %28 : !torch.vtensor<[2048,768],f32>, !torch.list<int> -> !torch.vtensor<[4,512,768],f32>
    %1712 = torch.aten.add.Tensor %1701, %1711, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %int0_753 = torch.constant.int 0
    %int1_754 = torch.constant.int 1
    %1713 = torch.aten.transpose.int %1708, %int0_753, %int1_754 : !torch.vtensor<[768,768],f32>, !torch.int, !torch.int -> !torch.vtensor<[768,768],f32>
    %1714 = torch.aten.mul.Tensor %1712, %arg56 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1715 = torch.aten.sub.Tensor %arg55, %result1, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1716 = torch.aten.mul.Tensor %1715, %result2 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1717 = torch.aten.mul.Tensor %1714, %arg1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1718 = torch.aten.mul.Tensor %1717, %0 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,768],f32>
    %1719 = torch.aten.sum.dim_IntList %1717, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1720 = torch.aten.mul.Tensor %1717, %1716 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1721 = torch.aten.sum.dim_IntList %1720, %35, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,512,1],f32>
    %1722 = torch.aten.mul.Tensor %1716, %1721 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32> -> !torch.vtensor<[4,512,768],f32>
    %1723 = torch.aten.sub.Tensor %1718, %1719, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,1],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1724 = torch.aten.sub.Tensor %1723, %1722, %int1 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32>, !torch.int -> !torch.vtensor<[4,512,768],f32>
    %1725 = torch.aten.div.Tensor %result2, %0 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[],si64> -> !torch.vtensor<[4,512,1],f32>
    %1726 = torch.aten.mul.Tensor %1725, %1724 : !torch.vtensor<[4,512,1],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1727 = torch.aten.mul.Tensor %1714, %1716 : !torch.vtensor<[4,512,768],f32>, !torch.vtensor<[4,512,768],f32> -> !torch.vtensor<[4,512,768],f32>
    %1728 = torch.aten.sum.dim_IntList %1727, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1729 = torch.aten.sum.dim_IntList %1714, %45, %false, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[768],f32>
    %1730 = torch.aten.sum.dim_IntList %1726, %24, %true, %none : !torch.vtensor<[4,512,768],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,512,768],f32>
    %1731 = torch.prim.ListConstruct %int512, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
    %1732 = torch.aten.view %1730, %1731 : !torch.vtensor<[1,512,768],f32>, !torch.list<int> -> !torch.vtensor<[512,768],f32>
    %none_755 = torch.constant.none
    %1733 = torch.aten.empty.memory_format %1731, %int6, %int0, %cpu, %false, %none_755 : !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool, !torch.none -> !torch.vtensor<[512,768],f32>
    %int0_756 = torch.constant.int 0
    %1734 = torch.valsem.aten.fill.Scalar %1733, %int0_756 : !torch.vtensor<[512,768],f32>, !torch.int -> !torch.vtensor<[512,768],f32>
    %1735 = torch.prim.ListConstruct %int512 : (!torch.int) -> !torch.list<int>
    %1736 = torch.aten.view %arg54, %1735 : !torch.vtensor<[1,512],si64>, !torch.list<int> -> !torch.vtensor<[512],si64>
    %1737 = torch.aten.ne.Scalar %1736, %int-1 : !torch.vtensor<[512],si64>, !torch.int -> !torch.vtensor<[512],i1>
    %1738 = torch.aten.unsqueeze %1737, %int1 : !torch.vtensor<[512],i1>, !torch.int -> !torch.vtensor<[512,1],i1>
    %1739 = torch.aten.broadcast_to %1738, %1731 : !torch.vtensor<[512,1],i1>, !torch.list<int> -> !torch.vtensor<[512,768],i1>
    %int0_757 = torch.constant.int 0
    %1740 = torch.aten.size.int %1732, %int0_757 : !torch.vtensor<[512,768],f32>, !torch.int -> !torch.int
    %int1_758 = torch.constant.int 1
    %1741 = torch.aten.size.int %1732, %int1_758 : !torch.vtensor<[512,768],f32>, !torch.int -> !torch.int
    %1742 = torch.prim.ListConstruct %1740, %1741 : (!torch.int, !torch.int) -> !torch.list<int>
    %1743 = torch.aten.empty.memory_format %1742, %int6, %int0, %cpu, %false, %none : !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool, !torch.none -> !torch.vtensor<[512,768],f32>
    %1744 = torch.valsem.aten.fill.Scalar %1743, %int0 : !torch.vtensor<[512,768],f32>, !torch.int -> !torch.vtensor<[512,768],f32>
    %1745 = torch.aten.where.self %1739, %1732, %1744 : !torch.vtensor<[512,768],i1>, !torch.vtensor<[512,768],f32>, !torch.vtensor<[512,768],f32> -> !torch.vtensor<[512,768],f32>
    %1746 = torch.prim.ListConstruct %1736 : (!torch.vtensor<[512],si64>) -> !torch.list<vtensor>
    %false_759 = torch.constant.bool false
    %1747 = torch.valsem.aten.index_put_impl %1734, %1746, %1745, %true, %false_759 : !torch.vtensor<[512,768],f32>, !torch.list<vtensor>, !torch.vtensor<[512,768],f32>, !torch.bool, !torch.bool -> !torch.vtensor<[512,768],f32>
    %1748 = torch.aten.view %1726, %49 : !torch.vtensor<[4,512,768],f32>, !torch.list<int> -> !torch.vtensor<[2048,768],f32>
    %1749 = torch.prim.ListConstruct %int2, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
    %none_760 = torch.constant.none
    %1750 = torch.aten.empty.memory_format %1749, %int6, %int0, %cpu, %false, %none_760 : !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool, !torch.none -> !torch.vtensor<[2,768],f32>
    %int0_761 = torch.constant.int 0
    %1751 = torch.valsem.aten.fill.Scalar %1750, %int0_761 : !torch.vtensor<[2,768],f32>, !torch.int -> !torch.vtensor<[2,768],f32>
    %1752 = torch.aten.clone %arg53, %int0 : !torch.vtensor<[4,512],si64>, !torch.int -> !torch.vtensor<[4,512],si64>
    %1753 = torch.prim.ListConstruct %int2048 : (!torch.int) -> !torch.list<int>
    %1754 = torch.aten.view %1752, %1753 : !torch.vtensor<[4,512],si64>, !torch.list<int> -> !torch.vtensor<[2048],si64>
    %1755 = torch.aten.ne.Scalar %1754, %int-1 : !torch.vtensor<[2048],si64>, !torch.int -> !torch.vtensor<[2048],i1>
    %1756 = torch.aten.unsqueeze %1755, %int1 : !torch.vtensor<[2048],i1>, !torch.int -> !torch.vtensor<[2048,1],i1>
    %1757 = torch.aten.broadcast_to %1756, %49 : !torch.vtensor<[2048,1],i1>, !torch.list<int> -> !torch.vtensor<[2048,768],i1>
    %int0_762 = torch.constant.int 0
    %1758 = torch.aten.size.int %1748, %int0_762 : !torch.vtensor<[2048,768],f32>, !torch.int -> !torch.int
    %int1_763 = torch.constant.int 1
    %1759 = torch.aten.size.int %1748, %int1_763 : !torch.vtensor<[2048,768],f32>, !torch.int -> !torch.int
    %1760 = torch.prim.ListConstruct %1758, %1759 : (!torch.int, !torch.int) -> !torch.list<int>
    %1761 = torch.aten.empty.memory_format %1760, %int6, %int0, %cpu, %false, %none : !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool, !torch.none -> !torch.vtensor<[2048,768],f32>
    %1762 = torch.valsem.aten.fill.Scalar %1761, %int0 : !torch.vtensor<[2048,768],f32>, !torch.int -> !torch.vtensor<[2048,768],f32>
    %1763 = torch.aten.where.self %1757, %1748, %1762 : !torch.vtensor<[2048,768],i1>, !torch.vtensor<[2048,768],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[2048,768],f32>
    %1764 = torch.prim.ListConstruct %1754 : (!torch.vtensor<[2048],si64>) -> !torch.list<vtensor>
    %false_764 = torch.constant.bool false
    %1765 = torch.valsem.aten.index_put_impl %1751, %1764, %1763, %true, %false_764 : !torch.vtensor<[2,768],f32>, !torch.list<vtensor>, !torch.vtensor<[2048,768],f32>, !torch.bool, !torch.bool -> !torch.vtensor<[2,768],f32>
    %1766 = torch.prim.ListConstruct %int30522, %int768 : (!torch.int, !torch.int) -> !torch.list<int>
    %none_765 = torch.constant.none
    %1767 = torch.aten.empty.memory_format %1766, %int6, %int0, %cpu, %false, %none_765 : !torch.list<int>, !torch.int, !torch.int, !torch.Device, !torch.bool, !torch.none -> !torch.vtensor<[30522,768],f32>
    %int0_766 = torch.constant.int 0
    %1768 = torch.valsem.aten.fill.Scalar %1767, %int0_766 : !torch.vtensor<[30522,768],f32>, !torch.int -> !torch.vtensor<[30522,768],f32>
    %1769 = torch.aten.view %arg52, %1753 : !torch.vtensor<[4,512],si64>, !torch.list<int> -> !torch.vtensor<[2048],si64>
    %1770 = torch.aten.ne.Scalar %1769, %int0 : !torch.vtensor<[2048],si64>, !torch.int -> !torch.vtensor<[2048],i1>
    %1771 = torch.aten.unsqueeze %1770, %int1 : !torch.vtensor<[2048],i1>, !torch.int -> !torch.vtensor<[2048,1],i1>
    %1772 = torch.aten.broadcast_to %1771, %49 : !torch.vtensor<[2048,1],i1>, !torch.list<int> -> !torch.vtensor<[2048,768],i1>
    %1773 = torch.aten.where.self %1772, %1748, %1762 : !torch.vtensor<[2048,768],i1>, !torch.vtensor<[2048,768],f32>, !torch.vtensor<[2048,768],f32> -> !torch.vtensor<[2048,768],f32>
    %1774 = torch.prim.ListConstruct %1769 : (!torch.vtensor<[2048],si64>) -> !torch.list<vtensor>
    %false_767 = torch.constant.bool false
    %1775 = torch.valsem.aten.index_put_impl %1768, %1774, %1773, %true, %false_767 : !torch.vtensor<[30522,768],f32>, !torch.list<vtensor>, !torch.vtensor<[2048,768],f32>, !torch.bool, !torch.bool -> !torch.vtensor<[30522,768],f32>
    return %1729, %1728, %1747, %1765, %1775, %1629, %1628, %1638, %1640, %1699, %1702, %1710, %1713, %1685, %1688, %1611, %1614, %1591, %1590, %1600, %1602, %1492, %1491, %1501, %1503, %1562, %1565, %1573, %1576, %1548, %1551, %1474, %1477, %1454, %1453, %1463, %1465, %259, %258, %268, %270, %329, %332, %340, %343, %315, %318, %241, %244, %221, %220, %230, %232, %115, %114, %124, %126, %192, %195, %203, %206, %178, %181, %97, %100, %74, %73, %83, %86, %1355, %1354, %1364, %1366, %1425, %1428, %1436, %1439, %1411, %1414, %1337, %1340, %1317, %1316, %1326, %1328, %1218, %1217, %1227, %1229, %1288, %1291, %1299, %1302, %1274, %1277, %1200, %1203, %1180, %1179, %1189, %1191, %1081, %1080, %1090, %1092, %1151, %1154, %1162, %1165, %1137, %1140, %1063, %1066, %1043, %1042, %1052, %1054, %944, %943, %953, %955, %1014, %1017, %1025, %1028, %1000, %1003, %926, %929, %906, %905, %915, %917, %807, %806, %816, %818, %877, %880, %888, %891, %863, %866, %789, %792, %769, %768, %778, %780, %670, %669, %679, %681, %740, %743, %751, %754, %726, %729, %652, %655, %632, %631, %641, %643, %533, %532, %542, %544, %603, %606, %614, %617, %589, %592, %515, %518, %495, %494, %504, %506, %396, %395, %405, %407, %466, %469, %477, %480, %452, %455, %378, %381, %358, %357, %367, %369, %2, %27, %30, %47, %46, %57, %59, %2, %2, %2, %2 : !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[512,768],f32>, !torch.vtensor<[2,768],f32>, !torch.vtensor<[30522,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, 
!torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, 
!torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, 
!torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[3072],f32>, !torch.vtensor<[3072,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,3072],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[30522],f32>, !torch.vtensor<[30522,768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],f32>, !torch.vtensor<[768,768],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],f32>
  }
 }