GPT-1 basic principles:
Original paper and related articles:
# gpu driver
sudo ubuntu-drivers autoinstall
nvidia-smi
# dependencies
pip config set global.index-url http://pypi.tuna.tsinghua.edu.cn/simple
pip install numpy
pip install transformers
pip install datasets
pip install tiktoken
pip install wandb
pip install tqdm
# pytorch 1.13: requires setting compile=False in train.py
pip install torch
# pytorch 2.0: model acceleration uses torch.compile(), which only supports relatively new GPUs
# pip install --pre torch[dynamo] --force-reinstall --extra-index-url http://download.pytorch.org/whl/nightly/cu117 --timeout 60000
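Before going further, a quick sanity check (a small illustrative snippet, not part of the original setup) confirms that the installed PyTorch build can actually see the GPU driver installed above:

import torch

# Confirm the CUDA build of PyTorch can see the driver installed above.
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))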
Reference implementations for defining the GPT language model:
""" GPT 语言模型的定义参考: 1)OpenAI官方发布的GPT-2 TensorFlow 实现: http://github.com/openai/gpt-2/blob/master/src/model.py 2) huggingface/transformers PyTorch 实现: http://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py """ import numpy as np import tensorflow as tf from tensorflow.contrib.training import HParams def default_hparams(): return HParams( n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12, ) def shape_list(x): """Deal with dynamic shape in tensorflow cleanly.""" static = x.shape.as_list() dynamic = tf.shape(x) return [dynamic[i] if s is None else s for i, s in enumerate(static)] def softmax(x, axis=-1): x = x - tf.reduce_max(x, axis=axis, keepdims=True) ex = tf.exp(x) return ex / tf.reduce_sum(ex, axis=axis, keepdims=True) def gelu(x): return 0.5*x*(1+tf.tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3)))) def norm(x, scope, *, axis=-1, epsilon=1e-5): """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" with tf.variable_scope(scope): n_state = x.shape[-1].value g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1)) b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0)) u = tf.reduce_mean(x, axis=axis, keepdims=True) s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True) x = (x - u) * tf.rsqrt(s + epsilon) x = x*g + b return x def split_states(x, n): """Reshape the last dimension of x into [n, x.shape[-1]/n].""" *start, m = shape_list(x) return tf.reshape(x, start + [n, m//n]) def merge_states(x): """Smash the last two dimensions of x into a single dimension.""" *start, a, b = shape_list(x) return tf.reshape(x, start + [a*b]) def conv1d(x, scope, nf, *, w_init_stdev=0.02): with tf.variable_scope(scope): *start, nx = shape_list(x) w = tf.get_variable('w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev)) b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0)) c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf]) return c def attention_mask(nd, ns, *, dtype): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ i = tf.range(nd)[:,None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) def attn(x, scope, n_state, *, past, hparams): assert x.shape.ndims == 3 # Should be [batch, sequence, features] assert n_state % hparams.n_head == 0 if past is not None: assert past.shape.ndims == 5 # Should be [batch, 2, heads, sequence, features], where 2 is [k, v] def split_heads(x): # From [batch, sequence, features] to [batch, heads, sequence, features] return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3]) def merge_heads(x): # Reverse of split_heads return merge_states(tf.transpose(x, [0, 2, 1, 3])) def mask_attn_weights(w): # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 
_, _, nd, ns = shape_list(w) b = attention_mask(nd, ns, dtype=w.dtype) b = tf.reshape(b, [1, 1, nd, ns]) w = w*b - tf.cast(1e10, w.dtype)*(1-b) return w def multihead_attn(q, k, v): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) w = mask_attn_weights(w) w = softmax(w) a = tf.matmul(w, v) return a with tf.variable_scope(scope): c = conv1d(x, 'c_attn', n_state*3) q, k, v = map(split_heads, tf.split(c, 3, axis=2)) present = tf.stack([k, v], axis=1) if past is not None: pk, pv = tf.unstack(past, axis=1) k = tf.concat([pk, k], axis=-2) v = tf.concat([pv, v], axis=-2) a = multihead_attn(q, k, v) a = merge_heads(a) a = conv1d(a, 'c_proj', n_state) return a, present def mlp(x, scope, n_state, *, hparams): with tf.variable_scope(scope): nx = x.shape[-1].value h = gelu(conv1d(x, 'c_fc', n_state)) h2 = conv1d(h, 'c_proj', nx) return h2 def block(x, scope, *, past, hparams): with tf.variable_scope(scope): nx = x.shape[-1].value a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams) x = x + a m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams) x = x + m return x, present def past_shape(*, hparams, batch_size=None, sequence=None): return [batch_size, hparams.n_layer, 2, hparams.n_head, sequence, hparams.n_embd // hparams.n_head] def expand_tile(value, size): """Add a new axis of given size.""" value = tf.convert_to_tensor(value, name='value') ndims = value.shape.ndims return tf.tile(tf.expand_dims(value, axis=0), [size] + [1]*ndims) def positions_for(tokens, past_length): batch_size = tf.shape(tokens)[0] nsteps = tf.shape(tokens)[1] return expand_tile(past_length + tf.range(nsteps), batch_size) def model(hparams, X, past=None, scope='model', reuse=False): with tf.variable_scope(scope, reuse=reuse): results = {} batch, sequence = shape_list(X) wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd], initializer=tf.random_normal_initializer(stddev=0.01)) wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd], initializer=tf.random_normal_initializer(stddev=0.02)) past_length = 0 if past is None else tf.shape(past)[-2] h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) # Transformer presents = [] pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer assert len(pasts) == hparams.n_layer for layer, past in enumerate(pasts): h, present = block(h, 'h%d' % layer, past=past, hparams=hparams) presents.append(present) results['present'] = tf.stack(presents, axis=1) h = norm(h, 'ln_f') # Language model loss. Do tokens <n predict token n? h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd]) logits = tf.matmul(h_flat, wte, transpose_b=True) logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab]) results['logits'] = logits return results
If you are not a deep learning professional and just want to get a feel for the magic and try it out, the fastest way to get started is to train a character-level GPT on the works of Shakespeare.
First, we download the corpus as a single (1MB) file and turn it from raw text into one large stream of integers:
# Fetch the Shakespeare corpus and map its characters to integers, producing the train.bin/val.bin datasets plus the encoder/decoder file meta.pkl (only produced when not using the GPT-2 encoder)
python3 data/shakespeare_char/prepare.py
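The prepare.py script builds a character-level vocabulary and serializes the text as token IDs. Roughly, it does something like the following (a simplified sketch with illustrative variable names, not the script itself):

import pickle
import numpy as np

text = open("data/shakespeare_char/input.txt", "r").read()

# character-level vocabulary: every distinct character becomes one token ID
chars = sorted(set(text))
stoi = {ch: i for i, ch in enumerate(chars)}   # encoder: char -> int
itos = {i: ch for ch, i in stoi.items()}       # decoder: int -> char

ids = np.array([stoi[c] for c in text], dtype=np.uint16)
n = len(ids)
ids[: int(n * 0.9)].tofile("train.bin")        # 90% train split
ids[int(n * 0.9):].tofile("val.bin")           # 10% val split

# meta.pkl stores the vocab so sample.py can decode generated IDs back to text
with open("meta.pkl", "wb") as f:
    pickle.dump({"vocab_size": len(chars), "stoi": stoi, "itos": itos}, f)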
Next we train a baby GPT model:
# train on shakespeare
python3 train.py config/train_shakespeare_char.py
# Measured on a V100 GPU: after about 100 minutes of training, the train loss reaches roughly 0.15 and the val loss ends around 3.72

Overriding config with config/train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
batch_size = 64
block_size = 256 # context of up to 256 previous characters
dtype = 'bfloat16'

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
compile = False # do not torch compile the model
# on GPU server
device = 'cuda'

total number of tokens per iteration: 655360
Traceback (most recent call last):
  File "train.py", line 106, in <module>
    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
  File "/usr/local/lib/python3.8/dist-packages/torch/amp/autocast_mode.py", line 234, in __init__
    raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.')
RuntimeError: Current CUDA Device does not support bfloat16. Please switch dtype to float16.
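The traceback above just means this GPU (a V100 here) has no bfloat16 support, so dtype should be switched to float16 in the config. A minimal workaround sketch (assuming PyTorch exposes torch.cuda.is_bf16_supported(), which recent versions do) is to probe the device and fall back accordingly:

import torch

# Pick a mixed-precision dtype the card can actually run;
# V100-class GPUs lack bfloat16, so fall back to float16 there.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'
print("using dtype:", dtype)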
root@iZt4n7oihq25tf613d4f0mZ:~/nanoGPT# python3 train.py config/train_shakespeare_char.py Overriding config with config/train_shakespeare_char.py: # train a miniature character-level shakespeare model # good for debugging and playing on macbooks and such out_dir = 'out-shakespeare-char' eval_interval = 250 # keep frequent because we'll overfit eval_iters = 200 log_interval = 10 # don't print too too often # we expect to overfit on this small dataset, so only save when val improves always_save_checkpoint = False wandb_log = False # override via command line if you like wandb_project = 'shakespeare-char' wandb_run_name = 'mini-gpt' dataset = 'shakespeare_char' batch_size = 64 block_size = 256 # context of up to 256 previous characters # baby GPT model :) n_layer = 6 n_head = 6 n_embd = 384 dropout = 0.2 learning_rate = 1e-3 # with baby networks can afford to go a bit higher max_iters = 5000 lr_decay_iters = 5000 # make equal to max_iters usually min_lr = 1e-4 # learning_rate / 10 usually beta2 = 0.99 # make a bit bigger because number of tokens per iter is small warmup_iters = 100 # not super necessary potentially # on macbook also add # device = 'cpu' # run on cpu only # compile = False # do not torch compile the model # on GPU server device = 'cuda' total number of tokens per iteration: 655360 Traceback (most recent call last): File "train.py", line 106, in <module> ptdtype = {'float32': torch.float32, 'float16': torch.float16}[dtype] KeyError: 'bfloat16' root@iZt4n7oihq25tf613d4f0mZ:~/nanoGPT# python3 train.py config/train_shakespeare_char.py root@iZt4n7oihq25tf613d4f0mZ:~/nanoGPT# python3 train.py config/train_shakespeare_char.py Overriding config with config/train_shakespeare_char.py: # train a miniature character-level shakespeare model # good for debugging and playing on macbooks and such out_dir = 'out-shakespeare-char' eval_interval = 250 # keep frequent because we'll overfit eval_iters = 200 log_interval = 10 # don't print too too often # we expect to overfit on this small dataset, so only save when val improves always_save_checkpoint = False wandb_log = False # override via command line if you like wandb_project = 'shakespeare-char' wandb_run_name = 'mini-gpt' dataset = 'shakespeare_char' batch_size = 64 block_size = 256 # context of up to 256 previous characters # baby GPT model :) n_layer = 6 n_head = 6 n_embd = 384 dropout = 0.2 learning_rate = 1e-3 # with baby networks can afford to go a bit higher max_iters = 5000 lr_decay_iters = 5000 # make equal to max_iters usually min_lr = 1e-4 # learning_rate / 10 usually beta2 = 0.99 # make a bit bigger because number of tokens per iter is small warmup_iters = 100 # not super necessary potentially # on macbook also add # device = 'cpu' # run on cpu only # compile = False # do not torch compile the model # on GPU server device = 'cuda' total number of tokens per iteration: 655360 found vocab_size = 65 (inside data/shakespeare_char/meta.pkl) Initializing a new model from scratch number of parameters: 10.65M using fused AdamW: True compiling the model... 
(takes a ~minute) step 0: train loss 4.2874, val loss 4.2823 [2023-04-14 09:55:30,333] torch._inductor.utils: [WARNING] using triton random, expect difference from eager iter 0: loss 4.2586, time 35048.17ms, mfu -100.00% iter 10: loss 3.2202, time 1389.25ms, mfu 10.73% iter 20: loss 2.7714, time 1392.25ms, mfu 10.73% iter 30: loss 2.6154, time 1392.92ms, mfu 10.72% iter 40: loss 2.5368, time 1394.59ms, mfu 10.72% iter 50: loss 2.5093, time 1394.69ms, mfu 10.72% iter 60: loss 2.4757, time 1394.85ms, mfu 10.71% iter 70: loss 2.5073, time 1394.47ms, mfu 10.71% iter 80: loss 2.4474, time 1394.96ms, mfu 10.71% iter 90: loss 2.4334, time 1395.45ms, mfu 10.71% iter 100: loss 2.4050, time 1395.06ms, mfu 10.70% iter 110: loss 2.3856, time 1396.25ms, mfu 10.70% iter 120: loss 2.3631, time 1394.67ms, mfu 10.70% iter 130: loss 2.3024, time 1394.34ms, mfu 10.70% iter 140: loss 2.2330, time 1394.40ms, mfu 10.70% iter 150: loss 2.1229, time 1396.18ms, mfu 10.70% iter 160: loss 2.0596, time 1396.76ms, mfu 10.69% iter 170: loss 2.0247, time 1396.03ms, mfu 10.69% iter 180: loss 1.9253, time 1394.91ms, mfu 10.69% iter 190: loss 1.8770, time 1395.84ms, mfu 10.69% iter 200: loss 1.8505, time 1396.60ms, mfu 10.69% iter 210: loss 1.8220, time 1396.59ms, mfu 10.69% iter 220: loss 1.7351, time 1397.92ms, mfu 10.68% iter 230: loss 1.7186, time 1396.52ms, mfu 10.68% iter 240: loss 1.6742, time 1395.36ms, mfu 10.68% step 250: train loss 1.5482, val loss 1.7322 saving checkpoint to out-shakespeare-char iter 250: loss 1.6170, time 6002.98ms, mfu 9.86% iter 260: loss 1.6227, time 1396.29ms, mfu 9.94% iter 270: loss 1.6086, time 1395.38ms, mfu 10.02% iter 280: loss 1.5508, time 1396.46ms, mfu 10.08% iter 290: loss 1.5237, time 1395.96ms, mfu 10.14% iter 300: loss 1.5497, time 1395.28ms, mfu 10.20% iter 310: loss 1.5187, time 1397.89ms, mfu 10.24% iter 320: loss 1.5137, time 1396.06ms, mfu 10.29% iter 330: loss 1.5041, time 1395.99ms, mfu 10.33% iter 340: loss 1.4562, time 1394.80ms, mfu 10.36% iter 350: loss 1.4466, time 1396.15ms, mfu 10.39% iter 360: loss 1.3967, time 1399.17ms, mfu 10.42% iter 370: loss 1.3867, time 1396.58ms, mfu 10.44% iter 380: loss 1.3648, time 1395.66ms, mfu 10.47% iter 390: loss 1.3446, time 1395.32ms, mfu 10.49% iter 400: loss 1.3223, time 1396.27ms, mfu 10.51% iter 410: loss 1.3614, time 1395.41ms, mfu 10.53% iter 420: loss 1.3121, time 1396.52ms, mfu 10.54% iter 430: loss 1.2831, time 1396.91ms, mfu 10.55% iter 440: loss 1.3500, time 1395.62ms, mfu 10.57% iter 450: loss 1.3271, time 1395.62ms, mfu 10.58% iter 460: loss 1.2502, time 1396.25ms, mfu 10.59% iter 470: loss 1.3077, time 1397.06ms, mfu 10.60% iter 480: loss 1.2766, time 1396.11ms, mfu 10.60% iter 490: loss 1.2447, time 1395.38ms, mfu 10.61% step 500: train loss 1.1257, val loss 1.4794 saving checkpoint to out-shakespeare-char iter 500: loss 1.2409, time 6310.81ms, mfu 9.79% iter 510: loss 1.2128, time 1395.79ms, mfu 9.88% iter 520: loss 1.1950, time 1396.49ms, mfu 9.96% iter 530: loss 1.2109, time 1397.05ms, mfu 10.03% iter 540: loss 1.1947, time 1396.67ms, mfu 10.09% iter 550: loss 1.1853, time 1395.17ms, mfu 10.15% iter 560: loss 1.2016, time 1396.62ms, mfu 10.20% iter 570: loss 1.1693, time 1397.37ms, mfu 10.25% iter 580: loss 1.1706, time 1395.91ms, mfu 10.29% iter 590: loss 1.1353, time 1396.05ms, mfu 10.33% iter 600: loss 1.1314, time 1395.56ms, mfu 10.37% iter 610: loss 1.1187, time 1395.38ms, mfu 10.40% iter 620: loss 1.1109, time 1396.23ms, mfu 10.42% iter 630: loss 1.0877, time 1397.28ms, mfu 10.45% iter 640: loss 1.1222, 
time 1397.20ms, mfu 10.47% iter 650: loss 1.0938, time 1396.87ms, mfu 10.49% iter 660: loss 1.0652, time 1396.59ms, mfu 10.51% iter 670: loss 1.0469, time 1397.01ms, mfu 10.52% iter 680: loss 1.0372, time 1397.16ms, mfu 10.54% iter 690: loss 1.0529, time 1397.81ms, mfu 10.55% iter 700: loss 1.0402, time 1396.57ms, mfu 10.56% iter 710: loss 1.0225, time 1396.72ms, mfu 10.57% iter 720: loss 0.9876, time 1396.02ms, mfu 10.58% iter 730: loss 1.0127, time 1396.87ms, mfu 10.59% iter 740: loss 0.9794, time 1397.62ms, mfu 10.60% step 750: train loss 0.7875, val loss 1.5834 iter 750: loss 0.9941, time 5848.39ms, mfu 9.80% iter 760: loss 0.9972, time 1394.78ms, mfu 9.88% iter 770: loss 0.9471, time 1397.23ms, mfu 9.96% iter 780: loss 0.9479, time 1397.72ms, mfu 10.03% iter 790: loss 0.9377, time 1396.61ms, mfu 10.10% iter 800: loss 0.8917, time 1397.22ms, mfu 10.15% iter 810: loss 0.8710, time 1396.42ms, mfu 10.21% iter 820: loss 0.8780, time 1395.73ms, mfu 10.25% iter 830: loss 0.8634, time 1396.84ms, mfu 10.29% iter 840: loss 0.8529, time 1397.88ms, mfu 10.33% iter 850: loss 0.8546, time 1396.87ms, mfu 10.37% iter 860: loss 0.8158, time 1396.09ms, mfu 10.40% iter 870: loss 0.8265, time 1395.68ms, mfu 10.42% iter 880: loss 0.8065, time 1396.95ms, mfu 10.45% iter 890: loss 0.8108, time 1397.06ms, mfu 10.47% iter 900: loss 0.7922, time 1395.59ms, mfu 10.49% iter 910: loss 0.8111, time 1396.20ms, mfu 10.51% iter 920: loss 0.7672, time 1396.92ms, mfu 10.53% iter 930: loss 0.7691, time 1397.41ms, mfu 10.54% iter 940: loss 0.7607, time 1397.17ms, mfu 10.55% iter 950: loss 0.7706, time 1396.58ms, mfu 10.57% iter 960: loss 0.7467, time 1396.98ms, mfu 10.58% iter 970: loss 0.7432, time 1395.20ms, mfu 10.59% iter 980: loss 0.7039, time 1396.55ms, mfu 10.59% iter 990: loss 0.7100, time 1397.82ms, mfu 10.60% step 1000: train loss 0.3959, val loss 1.9050 iter 1000: loss 0.6856, time 5838.01ms, mfu 9.80% iter 1010: loss 0.6781, time 1396.71ms, mfu 9.88% iter 1020: loss 0.6765, time 1395.95ms, mfu 9.96% iter 1030: loss 0.6651, time 1395.96ms, mfu 10.03% iter 1040: loss 0.6758, time 1397.37ms, mfu 10.10% iter 1050: loss 0.6483, time 1397.06ms, mfu 10.16% iter 1060: loss 0.6382, time 1397.28ms, mfu 10.21% iter 1070: loss 0.5898, time 1397.25ms, mfu 10.25% iter 1080: loss 0.6376, time 1396.11ms, mfu 10.29% iter 1090: loss 0.6204, time 1396.74ms, mfu 10.33% iter 1100: loss 0.5924, time 1397.62ms, mfu 10.37% iter 1110: loss 0.5955, time 1395.80ms, mfu 10.40% iter 1120: loss 0.5758, time 1395.05ms, mfu 10.43% iter 1130: loss 0.5956, time 1396.69ms, mfu 10.45% iter 1140: loss 0.5833, time 1395.38ms, mfu 10.47% iter 1150: loss 0.5774, time 1397.76ms, mfu 10.49% iter 1160: loss 0.5521, time 1396.44ms, mfu 10.51% iter 1170: loss 0.5472, time 1394.76ms, mfu 10.53% iter 1180: loss 0.5513, time 1396.83ms, mfu 10.54% iter 1190: loss 0.5299, time 1395.86ms, mfu 10.56% iter 1200: loss 0.5342, time 1398.13ms, mfu 10.57% iter 1210: loss 0.5397, time 1396.97ms, mfu 10.58% iter 1220: loss 0.5248, time 1396.42ms, mfu 10.59% iter 1230: loss 0.5127, time 1395.75ms, mfu 10.60% iter 1240: loss 0.5328, time 1395.91ms, mfu 10.60% step 1250: train loss 0.1908, val loss 2.2568 iter 1250: loss 0.5135, time 5839.61ms, mfu 9.80% iter 1260: loss 0.5065, time 1397.27ms, mfu 9.89% iter 1270: loss 0.5214, time 1397.27ms, mfu 9.96% iter 1280: loss 0.4986, time 1395.61ms, mfu 10.04% iter 1290: loss 0.4790, time 1397.00ms, mfu 10.10% iter 1300: loss 0.4788, time 1396.65ms, mfu 10.16% iter 1310: loss 0.4886, time 1397.53ms, mfu 10.21% iter 1320: loss 
0.4646, time 1395.61ms, mfu 10.25% iter 1330: loss 0.4611, time 1395.60ms, mfu 10.30% iter 1340: loss 0.4612, time 1396.26ms, mfu 10.33% iter 1350: loss 0.4525, time 1396.28ms, mfu 10.37% iter 1360: loss 0.4236, time 1397.25ms, mfu 10.40% iter 1370: loss 0.4528, time 1395.44ms, mfu 10.43% iter 1380: loss 0.4495, time 1395.44ms, mfu 10.45% iter 1390: loss 0.4413, time 1394.99ms, mfu 10.48% iter 1400: loss 0.4362, time 1397.86ms, mfu 10.49% iter 1410: loss 0.4302, time 1397.51ms, mfu 10.51% iter 1420: loss 0.4267, time 1396.74ms, mfu 10.53% iter 1430: loss 0.4190, time 1396.87ms, mfu 10.54% iter 1440: loss 0.4370, time 1396.64ms, mfu 10.55% iter 1450: loss 0.4101, time 1397.86ms, mfu 10.57% iter 1460: loss 0.4200, time 1396.51ms, mfu 10.58% iter 1470: loss 0.4043, time 1395.97ms, mfu 10.59% iter 1480: loss 0.4027, time 1396.46ms, mfu 10.59% iter 1490: loss 0.4051, time 1395.87ms, mfu 10.60% step 1500: train loss 0.1302, val loss 2.4975 iter 1500: loss 0.4120, time 5848.71ms, mfu 9.80% iter 1510: loss 0.3907, time 1396.52ms, mfu 9.89% iter 1520: loss 0.3884, time 1396.87ms, mfu 9.96% iter 1530: loss 0.3842, time 1395.13ms, mfu 10.04% iter 1540: loss 0.3896, time 1396.81ms, mfu 10.10% iter 1550: loss 0.3729, time 1396.97ms, mfu 10.16% iter 1560: loss 0.3719, time 1396.46ms, mfu 10.21% iter 1570: loss 0.3951, time 1397.06ms, mfu 10.25% iter 1580: loss 0.3723, time 1395.96ms, mfu 10.30% iter 1590: loss 0.3719, time 1396.24ms, mfu 10.33% iter 1600: loss 0.3787, time 1395.88ms, mfu 10.37% iter 1610: loss 0.3628, time 1395.75ms, mfu 10.40% iter 1620: loss 0.3713, time 1397.60ms, mfu 10.43% iter 1630: loss 0.3550, time 1396.80ms, mfu 10.45% iter 1640: loss 0.3717, time 1397.31ms, mfu 10.47% iter 1650: loss 0.3657, time 1395.51ms, mfu 10.49% iter 1660: loss 0.3553, time 1395.29ms, mfu 10.51% iter 1670: loss 0.3558, time 1396.13ms, mfu 10.53% iter 1680: loss 0.3377, time 1397.38ms, mfu 10.54% iter 1690: loss 0.3515, time 1396.99ms, mfu 10.55% iter 1700: loss 0.3486, time 1395.50ms, mfu 10.57% iter 1710: loss 0.3422, time 1395.65ms, mfu 10.58% iter 1720: loss 0.3527, time 1395.82ms, mfu 10.59% iter 1730: loss 0.3397, time 1397.71ms, mfu 10.60% iter 1740: loss 0.3379, time 1396.31ms, mfu 10.60% step 1750: train loss 0.1023, val loss 2.7085 iter 1750: loss 0.3402, time 5830.73ms, mfu 9.80% iter 1760: loss 0.3718, time 1396.05ms, mfu 9.89% iter 1770: loss 0.3542, time 1395.77ms, mfu 9.97% iter 1780: loss 0.3278, time 1396.88ms, mfu 10.04% iter 1790: loss 0.3237, time 1395.49ms, mfu 10.10% iter 1800: loss 0.3190, time 1396.11ms, mfu 10.16% iter 1810: loss 0.3168, time 1395.17ms, mfu 10.21% iter 1820: loss 0.3173, time 1397.21ms, mfu 10.26% iter 1830: loss 0.3182, time 1398.33ms, mfu 10.30% iter 1840: loss 0.3205, time 1396.30ms, mfu 10.33% iter 1850: loss 0.3148, time 1395.38ms, mfu 10.37% iter 1860: loss 0.3084, time 1396.11ms, mfu 10.40% iter 1870: loss 0.3156, time 1395.58ms, mfu 10.43% iter 1880: loss 0.3139, time 1396.58ms, mfu 10.45% iter 1890: loss 0.3217, time 1396.58ms, mfu 10.47% iter 1900: loss 0.3148, time 1397.03ms, mfu 10.49% iter 1910: loss 0.3084, time 1395.46ms, mfu 10.51% iter 1920: loss 0.3127, time 1395.68ms, mfu 10.53% iter 1930: loss 0.3201, time 1396.21ms, mfu 10.54% iter 1940: loss 0.3035, time 1397.30ms, mfu 10.56% iter 1950: loss 0.3101, time 1396.34ms, mfu 10.57% iter 1960: loss 0.2990, time 1396.22ms, mfu 10.58% iter 1970: loss 0.3049, time 1395.96ms, mfu 10.59% iter 1980: loss 0.2934, time 1395.03ms, mfu 10.60% iter 1990: loss 0.2874, time 1397.65ms, mfu 10.60% step 2000: train 
loss 0.0942, val loss 2.8577 iter 2000: loss 0.2923, time 5852.53ms, mfu 9.80% iter 2010: loss 0.2912, time 1395.97ms, mfu 9.89% iter 2020: loss 0.2946, time 1395.69ms, mfu 9.97% iter 2030: loss 0.3042, time 1396.45ms, mfu 10.04% iter 2040: loss 0.2845, time 1397.80ms, mfu 10.10% iter 2050: loss 0.2835, time 1395.66ms, mfu 10.16% iter 2060: loss 0.2955, time 1395.43ms, mfu 10.21% iter 2070: loss 0.2914, time 1396.30ms, mfu 10.26% iter 2080: loss 0.2805, time 1395.78ms, mfu 10.30% iter 2090: loss 0.2995, time 1396.00ms, mfu 10.34% iter 2100: loss 0.2913, time 1396.46ms, mfu 10.37% iter 2110: loss 0.2899, time 1396.60ms, mfu 10.40% iter 2120: loss 0.2925, time 1396.35ms, mfu 10.43% iter 2130: loss 0.2807, time 1396.85ms, mfu 10.45% iter 2140: loss 0.2756, time 1396.39ms, mfu 10.47% iter 2150: loss 0.2790, time 1395.13ms, mfu 10.50% iter 2160: loss 0.2801, time 1396.35ms, mfu 10.51% iter 2170: loss 0.2680, time 1396.02ms, mfu 10.53% iter 2180: loss 0.2809, time 1396.18ms, mfu 10.54% iter 2190: loss 0.2725, time 1396.69ms, mfu 10.56% iter 2200: loss 0.2723, time 1395.61ms, mfu 10.57% iter 2210: loss 0.2750, time 1395.76ms, mfu 10.58% iter 2220: loss 0.2665, time 1396.04ms, mfu 10.59% iter 2230: loss 0.2632, time 1397.37ms, mfu 10.60% iter 2240: loss 0.2750, time 1396.50ms, mfu 10.60% step 2250: train loss 0.0883, val loss 2.9841 iter 2250: loss 0.2809, time 5827.53ms, mfu 9.80% iter 2260: loss 0.2735, time 1395.44ms, mfu 9.89% iter 2270: loss 0.2649, time 1395.60ms, mfu 9.97% iter 2280: loss 0.2677, time 1396.26ms, mfu 10.04% iter 2290: loss 0.2708, time 1397.06ms, mfu 10.10% iter 2300: loss 0.2592, time 1395.89ms, mfu 10.16% iter 2310: loss 0.2555, time 1395.71ms, mfu 10.21% iter 2320: loss 0.2637, time 1395.79ms, mfu 10.26% iter 2330: loss 0.2607, time 1396.06ms, mfu 10.30% iter 2340: loss 0.2667, time 1396.14ms, mfu 10.34% iter 2350: loss 0.2542, time 1396.13ms, mfu 10.37% iter 2360: loss 0.2603, time 1394.59ms, mfu 10.40% iter 2370: loss 0.2569, time 1395.76ms, mfu 10.43% iter 2380: loss 0.2542, time 1395.66ms, mfu 10.46% iter 2390: loss 0.2636, time 1396.60ms, mfu 10.48% iter 2400: loss 0.2527, time 1396.18ms, mfu 10.50% iter 2410: loss 0.2454, time 1395.95ms, mfu 10.51% iter 2420: loss 0.2493, time 1395.37ms, mfu 10.53% iter 2430: loss 0.2559, time 1396.10ms, mfu 10.55% iter 2440: loss 0.2569, time 1396.71ms, mfu 10.56% iter 2450: loss 0.2573, time 1396.07ms, mfu 10.57% iter 2460: loss 0.2479, time 1395.60ms, mfu 10.58% iter 2470: loss 0.2514, time 1395.71ms, mfu 10.59% iter 2480: loss 0.2505, time 1396.36ms, mfu 10.60% iter 2490: loss 0.2551, time 1397.24ms, mfu 10.61% step 2500: train loss 0.0846, val loss 3.1065 iter 2500: loss 0.2564, time 5855.72ms, mfu 9.80% iter 2510: loss 0.2534, time 1395.32ms, mfu 9.89% iter 2520: loss 0.2538, time 1396.35ms, mfu 9.97% iter 2530: loss 0.2599, time 1397.59ms, mfu 10.04% iter 2540: loss 0.2439, time 1396.39ms, mfu 10.10% iter 2550: loss 0.2446, time 1396.15ms, mfu 10.16% iter 2560: loss 0.2497, time 1395.30ms, mfu 10.21% iter 2570: loss 0.2503, time 1395.12ms, mfu 10.26% iter 2580: loss 0.2413, time 1395.61ms, mfu 10.30% iter 2590: loss 0.2550, time 1397.11ms, mfu 10.34% iter 2600: loss 0.2450, time 1396.70ms, mfu 10.37% iter 2610: loss 0.2449, time 1396.05ms, mfu 10.40% iter 2620: loss 0.2401, time 1395.63ms, mfu 10.43% iter 2630: loss 0.2367, time 1395.24ms, mfu 10.45% iter 2640: loss 0.2387, time 1396.21ms, mfu 10.48% iter 2650: loss 0.2481, time 1395.63ms, mfu 10.50% iter 2660: loss 0.2281, time 1395.85ms, mfu 10.51% iter 2670: loss 0.2364, 
time 1396.07ms, mfu 10.53% iter 2680: loss 0.2368, time 1395.36ms, mfu 10.55% iter 2690: loss 0.2381, time 1396.54ms, mfu 10.56% iter 2700: loss 0.2320, time 1395.94ms, mfu 10.57% iter 2710: loss 0.2345, time 1395.72ms, mfu 10.58% iter 2720: loss 0.2361, time 1394.89ms, mfu 10.59% iter 2730: loss 0.2322, time 1396.44ms, mfu 10.60% iter 2740: loss 0.2180, time 1396.10ms, mfu 10.61% step 2750: train loss 0.0821, val loss 3.2077 iter 2750: loss 0.2246, time 5845.95ms, mfu 9.80% iter 2760: loss 0.2218, time 1395.24ms, mfu 9.89% iter 2770: loss 0.2278, time 1396.33ms, mfu 9.97% iter 2780: loss 0.2252, time 1396.55ms, mfu 10.04% iter 2790: loss 0.2253, time 1395.96ms, mfu 10.10% iter 2800: loss 0.2243, time 1395.94ms, mfu 10.16% iter 2810: loss 0.2170, time 1395.45ms, mfu 10.21% iter 2820: loss 0.2194, time 1395.59ms, mfu 10.26% iter 2830: loss 0.2282, time 1395.67ms, mfu 10.30% iter 2840: loss 0.2205, time 1396.07ms, mfu 10.34% iter 2850: loss 0.2295, time 1396.02ms, mfu 10.37% iter 2860: loss 0.2269, time 1395.82ms, mfu 10.40% iter 2870: loss 0.2227, time 1395.21ms, mfu 10.43% iter 2880: loss 0.2214, time 1396.80ms, mfu 10.45% iter 2890: loss 0.2117, time 1397.77ms, mfu 10.48% iter 2900: loss 0.2126, time 1396.02ms, mfu 10.50% iter 2910: loss 0.2238, time 1395.95ms, mfu 10.51% iter 2920: loss 0.2170, time 1396.77ms, mfu 10.53% iter 2930: loss 0.2303, time 1395.38ms, mfu 10.54% iter 2940: loss 0.2177, time 1396.25ms, mfu 10.56% iter 2950: loss 0.2164, time 1396.23ms, mfu 10.57% iter 2960: loss 0.2261, time 1394.96ms, mfu 10.58% iter 2970: loss 0.2162, time 1395.43ms, mfu 10.59% iter 2980: loss 0.2164, time 1395.56ms, mfu 10.60% iter 2990: loss 0.2181, time 1396.75ms, mfu 10.61% step 3000: train loss 0.0795, val loss 3.3033 iter 3000: loss 0.2120, time 5831.66ms, mfu 9.80% iter 3010: loss 0.2117, time 1394.72ms, mfu 9.89% iter 3020: loss 0.2109, time 1396.49ms, mfu 9.97% iter 3030: loss 0.2288, time 1395.74ms, mfu 10.04% iter 3040: loss 0.2185, time 1396.67ms, mfu 10.10% iter 3050: loss 0.2146, time 1396.54ms, mfu 10.16% iter 3060: loss 0.2063, time 1396.53ms, mfu 10.21% iter 3070: loss 0.2139, time 1395.24ms, mfu 10.26% iter 3080: loss 0.2122, time 1395.87ms, mfu 10.30% iter 3090: loss 0.2027, time 1397.17ms, mfu 10.34% iter 3100: loss 0.2144, time 1395.34ms, mfu 10.37% iter 3110: loss 0.2257, time 1396.23ms, mfu 10.40% iter 3120: loss 0.2102, time 1395.60ms, mfu 10.43% iter 3130: loss 0.2072, time 1396.10ms, mfu 10.45% iter 3140: loss 0.2082, time 1395.68ms, mfu 10.48% iter 3150: loss 0.2121, time 1396.81ms, mfu 10.50% iter 3160: loss 0.2061, time 1396.68ms, mfu 10.51% iter 3170: loss 0.1955, time 1395.18ms, mfu 10.53% iter 3180: loss 0.2053, time 1395.35ms, mfu 10.55% iter 3190: loss 0.2104, time 1395.92ms, mfu 10.56% iter 3200: loss 0.2140, time 1395.34ms, mfu 10.57% iter 3210: loss 0.1993, time 1395.98ms, mfu 10.58% iter 3220: loss 0.2012, time 1394.71ms, mfu 10.59% iter 3230: loss 0.2028, time 1395.98ms, mfu 10.60% iter 3240: loss 0.2138, time 1395.71ms, mfu 10.61% step 3250: train loss 0.0780, val loss 3.3859 iter 3250: loss 0.2091, time 5841.23ms, mfu 9.80% iter 3260: loss 0.2058, time 1396.07ms, mfu 9.89% iter 3270: loss 0.2043, time 1397.21ms, mfu 9.97% iter 3280: loss 0.2045, time 1396.75ms, mfu 10.04% iter 3290: loss 0.1999, time 1396.46ms, mfu 10.10% iter 3300: loss 0.2028, time 1396.42ms, mfu 10.16% iter 3310: loss 0.2022, time 1394.55ms, mfu 10.21% iter 3320: loss 0.1993, time 1395.66ms, mfu 10.26% iter 3330: loss 0.1987, time 1395.88ms, mfu 10.30% iter 3340: loss 0.2015, time 
1396.69ms, mfu 10.34% iter 3350: loss 0.2003, time 1395.98ms, mfu 10.37% iter 3360: loss 0.2053, time 1396.19ms, mfu 10.40% iter 3370: loss 0.2030, time 1396.23ms, mfu 10.43% iter 3380: loss 0.1946, time 1395.42ms, mfu 10.45% iter 3390: loss 0.1991, time 1396.85ms, mfu 10.48% iter 3400: loss 0.1966, time 1396.09ms, mfu 10.50% iter 3410: loss 0.2060, time 1396.34ms, mfu 10.51% iter 3420: loss 0.2016, time 1396.14ms, mfu 10.53% iter 3430: loss 0.2013, time 1395.82ms, mfu 10.54% iter 3440: loss 0.2015, time 1397.46ms, mfu 10.56% iter 3450: loss 0.1937, time 1395.00ms, mfu 10.57% iter 3460: loss 0.1895, time 1395.58ms, mfu 10.58% iter 3470: loss 0.1941, time 1393.97ms, mfu 10.59% iter 3480: loss 0.2000, time 1395.54ms, mfu 10.60% iter 3490: loss 0.1968, time 1396.20ms, mfu 10.61% step 3500: train loss 0.0765, val loss 3.4490 iter 3500: loss 0.2007, time 5843.20ms, mfu 9.80% iter 3510: loss 0.1947, time 1396.31ms, mfu 9.89% iter 3520: loss 0.1970, time 1395.62ms, mfu 9.97% iter 3530: loss 0.2012, time 1395.89ms, mfu 10.04% iter 3540: loss 0.1977, time 1396.59ms, mfu 10.10% iter 3550: loss 0.2031, time 1395.86ms, mfu 10.16% iter 3560: loss 0.1864, time 1395.95ms, mfu 10.21% iter 3570: loss 0.1994, time 1395.73ms, mfu 10.26% iter 3580: loss 0.1943, time 1395.84ms, mfu 10.30% iter 3590: loss 0.1883, time 1396.66ms, mfu 10.34% iter 3600: loss 0.1949, time 1395.57ms, mfu 10.37% iter 3610: loss 0.1937, time 1394.28ms, mfu 10.40% iter 3620: loss 0.1857, time 1395.96ms, mfu 10.43% iter 3630: loss 0.1880, time 1398.29ms, mfu 10.45% iter 3640: loss 0.1928, time 1395.48ms, mfu 10.48% iter 3650: loss 0.1925, time 1396.16ms, mfu 10.50% iter 3660: loss 0.1888, time 1394.57ms, mfu 10.52% iter 3670: loss 0.1942, time 1394.98ms, mfu 10.53% iter 3680: loss 0.1876, time 1395.96ms, mfu 10.55% iter 3690: loss 0.1879, time 1395.39ms, mfu 10.56% iter 3700: loss 0.1776, time 1395.65ms, mfu 10.57% iter 3710: loss 0.1937, time 1394.41ms, mfu 10.58% iter 3720: loss 0.1820, time 1396.65ms, mfu 10.59% iter 3730: loss 0.1953, time 1396.12ms, mfu 10.60% iter 3740: loss 0.1856, time 1395.92ms, mfu 10.61% step 3750: train loss 0.0755, val loss 3.5307 iter 3750: loss 0.1828, time 5845.61ms, mfu 9.80% iter 3760: loss 0.1798, time 1396.08ms, mfu 9.89% iter 3770: loss 0.1882, time 1395.16ms, mfu 9.97% iter 3780: loss 0.1831, time 1395.07ms, mfu 10.04% iter 3790: loss 0.1847, time 1395.98ms, mfu 10.10% iter 3800: loss 0.1837, time 1394.44ms, mfu 10.16% iter 3810: loss 0.1864, time 1394.99ms, mfu 10.22% iter 3820: loss 0.1850, time 1394.58ms, mfu 10.26% iter 3830: loss 0.1831, time 1395.50ms, mfu 10.30% iter 3840: loss 0.1845, time 1395.93ms, mfu 10.34% iter 3850: loss 0.1837, time 1395.45ms, mfu 10.38% iter 3860: loss 0.1850, time 1396.44ms, mfu 10.41% iter 3870: loss 0.1727, time 1395.43ms, mfu 10.43% iter 3880: loss 0.1832, time 1395.11ms, mfu 10.46% iter 3890: loss 0.1860, time 1396.42ms, mfu 10.48% iter 3900: loss 0.1835, time 1396.00ms, mfu 10.50% iter 3910: loss 0.1960, time 1395.40ms, mfu 10.52% iter 3920: loss 0.1815, time 1395.38ms, mfu 10.53% iter 3930: loss 0.1906, time 1395.15ms, mfu 10.55% iter 3940: loss 0.1807, time 1395.60ms, mfu 10.56% iter 3950: loss 0.1817, time 1398.31ms, mfu 10.57% iter 3960: loss 0.1764, time 1396.48ms, mfu 10.58% iter 3970: loss 0.1787, time 1395.17ms, mfu 10.59% iter 3980: loss 0.1727, time 1395.32ms, mfu 10.60% iter 3990: loss 0.1772, time 1395.31ms, mfu 10.61% step 4000: train loss 0.0742, val loss 3.5892 iter 4000: loss 0.1825, time 5853.00ms, mfu 9.80% iter 4010: loss 0.1832, time 
1395.68ms, mfu 9.89% iter 4020: loss 0.1800, time 1395.25ms, mfu 9.97% iter 4030: loss 0.1753, time 1394.93ms, mfu 10.04% iter 4040: loss 0.1822, time 1396.01ms, mfu 10.10% iter 4050: loss 0.1792, time 1396.04ms, mfu 10.16% iter 4060: loss 0.1805, time 1397.10ms, mfu 10.21% iter 4070: loss 0.1791, time 1396.38ms, mfu 10.26% iter 4080: loss 0.1727, time 1395.72ms, mfu 10.30% iter 4090: loss 0.1771, time 1395.96ms, mfu 10.34% iter 4100: loss 0.1730, time 1395.62ms, mfu 10.37% iter 4110: loss 0.1744, time 1396.09ms, mfu 10.40% iter 4120: loss 0.1790, time 1396.16ms, mfu 10.43% iter 4130: loss 0.1748, time 1395.76ms, mfu 10.46% iter 4140: loss 0.1809, time 1395.48ms, mfu 10.48% iter 4150: loss 0.1730, time 1396.41ms, mfu 10.50% iter 4160: loss 0.1768, time 1396.96ms, mfu 10.51% iter 4170: loss 0.1772, time 1396.36ms, mfu 10.53% iter 4180: loss 0.1701, time 1395.68ms, mfu 10.55% iter 4190: loss 0.1759, time 1394.81ms, mfu 10.56% iter 4200: loss 0.1776, time 1397.39ms, mfu 10.57% iter 4210: loss 0.1722, time 1397.47ms, mfu 10.58% iter 4220: loss 0.1730, time 1396.22ms, mfu 10.59% iter 4230: loss 0.1715, time 1394.98ms, mfu 10.60% iter 4240: loss 0.1782, time 1395.52ms, mfu 10.61% step 4250: train loss 0.0737, val loss 3.6301 iter 4250: loss 0.1742, time 5829.30ms, mfu 9.80% iter 4260: loss 0.1719, time 1395.96ms, mfu 9.89% iter 4270: loss 0.1737, time 1397.36ms, mfu 9.97% iter 4280: loss 0.1750, time 1396.12ms, mfu 10.04% iter 4290: loss 0.1716, time 1395.20ms, mfu 10.10% iter 4300: loss 0.1742, time 1395.36ms, mfu 10.16% iter 4310: loss 0.1698, time 1395.47ms, mfu 10.21% iter 4320: loss 0.1679, time 1396.83ms, mfu 10.26% iter 4330: loss 0.1758, time 1395.56ms, mfu 10.30% iter 4340: loss 0.1737, time 1395.96ms, mfu 10.34% iter 4350: loss 0.1728, time 1395.92ms, mfu 10.37% iter 4360: loss 0.1638, time 1395.93ms, mfu 10.40% iter 4370: loss 0.1704, time 1396.43ms, mfu 10.43% iter 4380: loss 0.1731, time 1395.75ms, mfu 10.45% iter 4390: loss 0.1734, time 1395.29ms, mfu 10.48% iter 4400: loss 0.1755, time 1396.20ms, mfu 10.50% iter 4410: loss 0.1734, time 1396.99ms, mfu 10.51% iter 4420: loss 0.1671, time 1396.79ms, mfu 10.53% iter 4430: loss 0.1746, time 1395.35ms, mfu 10.55% iter 4440: loss 0.1698, time 1394.89ms, mfu 10.56% iter 4450: loss 0.1709, time 1395.87ms, mfu 10.57% iter 4460: loss 0.1732, time 1396.20ms, mfu 10.58% iter 4470: loss 0.1709, time 1396.58ms, mfu 10.59% iter 4480: loss 0.1744, time 1395.89ms, mfu 10.60% iter 4490: loss 0.1680, time 1395.58ms, mfu 10.61% step 4500: train loss 0.0729, val loss 3.6449 iter 4500: loss 0.1739, time 5832.06ms, mfu 9.80% iter 4510: loss 0.1693, time 1396.05ms, mfu 9.89% iter 4520: loss 0.1708, time 1396.85ms, mfu 9.97% iter 4530: loss 0.1594, time 1395.56ms, mfu 10.04% iter 4540: loss 0.1661, time 1395.39ms, mfu 10.10% iter 4550: loss 0.1665, time 1395.12ms, mfu 10.16% iter 4560: loss 0.1690, time 1397.28ms, mfu 10.21% iter 4570: loss 0.1664, time 1395.59ms, mfu 10.26% iter 4580: loss 0.1691, time 1395.47ms, mfu 10.30% iter 4590: loss 0.1700, time 1394.83ms, mfu 10.34% iter 4600: loss 0.1639, time 1394.86ms, mfu 10.37% iter 4610: loss 0.1618, time 1395.88ms, mfu 10.40% iter 4620: loss 0.1678, time 1395.99ms, mfu 10.43% iter 4630: loss 0.1694, time 1396.96ms, mfu 10.46% iter 4640: loss 0.1697, time 1394.56ms, mfu 10.48% iter 4650: loss 0.1699, time 1393.97ms, mfu 10.50% iter 4660: loss 0.1691, time 1395.08ms, mfu 10.52% iter 4670: loss 0.1747, time 1395.97ms, mfu 10.53% iter 4680: loss 0.1670, time 1396.05ms, mfu 10.55% iter 4690: loss 0.1677, time 
1395.60ms, mfu 10.56% iter 4700: loss 0.1668, time 1396.53ms, mfu 10.57% iter 4710: loss 0.1686, time 1396.15ms, mfu 10.58% iter 4720: loss 0.1749, time 1397.71ms, mfu 10.59% iter 4730: loss 0.1677, time 1396.46ms, mfu 10.60% iter 4740: loss 0.1651, time 1395.50ms, mfu 10.61% step 4750: train loss 0.0726, val loss 3.6949 iter 4750: loss 0.1612, time 5828.90ms, mfu 9.80% iter 4760: loss 0.1647, time 1396.15ms, mfu 9.89% iter 4770: loss 0.1631, time 1397.44ms, mfu 9.97% iter 4780: loss 0.1584, time 1397.02ms, mfu 10.04% iter 4790: loss 0.1677, time 1395.88ms, mfu 10.10% iter 4800: loss 0.1676, time 1395.00ms, mfu 10.16% iter 4810: loss 0.1651, time 1394.33ms, mfu 10.21% iter 4820: loss 0.1628, time 1395.26ms, mfu 10.26% iter 4830: loss 0.1674, time 1396.76ms, mfu 10.30% iter 4840: loss 0.1605, time 1395.77ms, mfu 10.34% iter 4850: loss 0.1639, time 1395.68ms, mfu 10.37% iter 4860: loss 0.1762, time 1395.44ms, mfu 10.40% iter 4870: loss 0.1628, time 1396.52ms, mfu 10.43% iter 4880: loss 0.1628, time 1396.79ms, mfu 10.45% iter 4890: loss 0.1591, time 1395.89ms, mfu 10.48% iter 4900: loss 0.1672, time 1395.60ms, mfu 10.50% iter 4910: loss 0.1634, time 1396.36ms, mfu 10.51% iter 4920: loss 0.1596, time 1396.20ms, mfu 10.53% iter 4930: loss 0.1680, time 1396.82ms, mfu 10.54% iter 4940: loss 0.1590, time 1396.95ms, mfu 10.56% iter 4950: loss 0.1608, time 1395.59ms, mfu 10.57% iter 4960: loss 0.1650, time 1394.79ms, mfu 10.58% iter 4970: loss 0.1675, time 1395.83ms, mfu 10.59% iter 4980: loss 0.1636, time 1397.06ms, mfu 10.60% iter 4990: loss 0.1673, time 1396.25ms, mfu 10.61% step 5000: train loss 0.0720, val loss 3.7275 iter 5000: loss 0.1584, time 5840.43ms, mfu 9.80%
From the config file we can see that we are essentially training a GPT with a context of up to 256 characters and 384 feature channels: a 6-layer Transformer with 6 attention heads per layer. On an A100 GPU this training run takes about 3 minutes and reaches a best validation loss of 1.4697; on a V100, reaching a comparable loss takes around 120 minutes. We can also speed the run up by adjusting the training parameters:
# Noisier but faster evaluation (eval_iters 200 -> 20), a smaller context (block_size 256 -> 64), a smaller batch per iteration (batch_size 64 -> 12), a smaller Transformer (4 layers, 4 heads, 128-dim embeddings), fewer iterations (2000), and no dropout regularization (dropout=0.0)
python3 train.py config/train_shakespeare_char.py --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0
According to the config, model checkpoints are written to the --out_dir directory, out-shakespeare-char.
So once training finishes, we can sample from the best model by pointing the sampling script at this directory:
# text generation
python3 sample.py --out_dir=out-shakespeare-char --num_samples=2
This generates a few samples (unconditioned, i.e. from scratch), for example:
Overriding: out_dir = out-shakespeare-char
Overriding: num_samples = 2
number of parameters: 10.65M
Loading meta from data/shakespeare_char/meta.pkl...

YORK:
I think will be anger than you boast.

YORK:
Say you gone.

GLOUCESTER:
King Edward did hath, away! what fast?

YORK:
That stand for the course of the beggar?

EXETER:
The Hereford that Bolingbroke is wars!

YORK:
And let him be come to speak.

YORK:
The violet of die sights; but like the Saint George of Grey, The queen, this son shall strong of his business noble trust Are great live to malk of what dear?

YORK:
Nay, but I still in a hope of thine own. Give me a save of your chamber voic
---------------
Menenius, sir, you shall have hand you have sorry; Therefore lay the world call you to the dead.

MARIANA:
I should so, my lords.

LADY ANNE:
What is Anne moved and pass'd from my move?

KING RICHARD III:
What art you to prove my life, is dead?

GLOUCESTER:
Nor bear that you but wear to be your lord?

LADY ANNE:
Not in all the city, if I shall please in my grace, Though to death my denied cousin's lips, Ground be that I have learn'd to keep you have wrong, But at doth both the sea against of you
The samples above were produced by the model on its own, without any instruction tuning.
Building on this model, we can feed in a leading piece of text as a prompt to get more targeted output. In essence, instead of having GPT start "from nothing", we ask it to continue from a specified "context".
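Under the hood, the --start text is simply encoded and used as the initial context, and generation continues token by token from there. A rough sketch of that conditioning step, assuming nanoGPT's checkpoint layout (ckpt.pt holding model_args and a state dict) and its GPT.generate() helper; the paths and prompt below are illustrative:

import pickle
import torch
from model import GPT, GPTConfig  # nanoGPT's model definition

# load the best checkpoint written by train.py (path is illustrative)
ckpt = torch.load('out-shakespeare-char/ckpt.pt', map_location='cpu')
model = GPT(GPTConfig(**ckpt['model_args']))
model.load_state_dict({k.replace('_orig_mod.', ''): v for k, v in ckpt['model'].items()})
model.eval()

# character-level encoder/decoder saved by prepare.py
meta = pickle.load(open('data/shakespeare_char/meta.pkl', 'rb'))
encode = lambda s: [meta['stoi'][c] for c in s]
decode = lambda ids: ''.join(meta['itos'][i] for i in ids)

# condition on a starting context instead of generating "from nothing"
start = "ROMEO:"
x = torch.tensor([encode(start)], dtype=torch.long)
y = model.generate(x, max_new_tokens=200, temperature=0.8, top_k=200)
print(decode(y[0].tolist()))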
# text generation, conditioned on a starting file
python3 sample.py --out_dir=out-shakespeare-char --num_samples=2 --start="FILE:./fine-tune/webshell_alpaca_like_dataset.json"
As you can see, the output is complete nonsense. Our training corpus contains nothing about "webshell code descriptions", so the model naturally cannot have learned anything about them from it.
First the dataset needs to be tokenized. Here we directly use the openly available OpenWebText dataset (12.9 GB, the publicly available counterpart of the data OpenAI used):
# Download the openwebtext dataset and process it into binary files for training (train.bin/val.bin, GPT-2 BPE token IDs, uint16)
python3 data/openwebtext/prepare.py
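The resulting train.bin/val.bin are flat arrays of uint16 token IDs, and training slices batches straight out of them. A minimal sketch of that access pattern (modeled on nanoGPT's get_batch; the sizes here are illustrative):

import numpy as np
import torch

block_size, batch_size = 1024, 12

# memory-map the token file so the full 12.9GB corpus never has to fit in RAM
data = np.memmap('data/openwebtext/train.bin', dtype=np.uint16, mode='r')

# sample random windows: x is the context, y is x shifted by one token (the targets)
ix = torch.randint(len(data) - block_size, (batch_size,))
x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
y = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])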
Then we are ready to train. To reproduce GPT-2 (124M), you will need at least one 8x A100 40GB node, running:
# training on an ordinary single GPU
python3 train.py
# training on A100s (adjust --nproc_per_node to the number of A100 cards)
# torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
As you can see, reaching convergence takes a very long time; since the arrival of large models, the demands on compute architecture and cluster performance have grown by several orders of magnitude.
GPT is a pretrained model, so to save time we can simply load the GPT-2 pretrained weights and then continue training on our own corpus.
Initialize from the GPT-2 pretrained model and fine-tune it on the Shakespeare dataset (testing whether GPT-2 can transfer to a new NLP task via fine-tuning), training with a smaller learning rate:
# Fetch the Shakespeare corpus and generate train.bin/val.bin with the OpenAI BPE tokenizer
python3 data/shakespeare/prepare.py
# Load the GPT-2 pretrained model and train on the Shakespeare data
python3 train.py config/finetune_shakespeare.py
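Setting init_from to a GPT-2 variant (e.g. 'gpt2' or 'gpt2-xl') in the fine-tuning config makes train.py start from the published pretrained weights instead of a random initialization. A hedged illustration of where those weights come from, using the transformers library directly (not the exact loading code in train.py):

from transformers import GPT2LMHeadModel

# Download the 124M GPT-2 checkpoint; fine-tuning then continues from these weights
# instead of from scratch, which is why far fewer iterations are needed.
hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
print(sum(p.numel() for p in hf_model.parameters()) / 1e6, "M parameters")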
# Load the GPT-2 pretrained model and generate text
python3 sample.py --init_from=gpt2 --start="生命、宇宙和一切的答案是什么?" --num_samples=3 --max_new_tokens=1000
The best checkpoint model (lowest validation loss) produced by the local training above is saved in the out_dir directory. To load that further-trained model for text generation, point sample.py at it with the --out_dir argument.
# Load the local model and generate text from a string prompt
python3 sample.py --out_dir=out-shakespeare --start="hi" --num_samples=3 --max_new_tokens=100
# Load the local model and generate text from a file prompt
python3 sample.py --out_dir=out-shakespeare --start="FILE:./fine-tune/webshell_alpaca_like_dataset.json" --num_samples=3 --max_new_tokens=1000
References:
http://blog.yanjingang.com/?p=7102
http://blog.csdn.net/xixiaoyaoww/article/details/130073328
http://github.com/karpathy/nanoGPT
http://arxiv.org/pdf/2107.03374.pdf
import os
import requests
import tiktoken
import numpy as np

# foreach all the file and return the file's content
def read_files(data_path):
    content = ""
    for root, dirs, files in os.walk(data_path):
        for file in files:
            file_path = os.path.join(root, file)
            print("file_path", file_path)
            with open(file_path, "r") as f:
                content += f.read()
            # add one line ---- <EOF END OF THE FILE> -----
            content += "---- <EOF END OF THE FILE> ----- \n"
    return content

# write content into file
def write_file(content, input_file_path):
    with open(input_file_path, "w") as f:
        f.write(content)

if __name__ == "__main__":
    data_path = "./data/fomo_webshell/fomo_v2"
    input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
    content = read_files(data_path)
    write_file(content, input_file_path)

    with open(input_file_path, 'r') as f:
        data = f.read()
    n = len(data)
    train_data = data[:int(n*0.9)]
    val_data = data[int(n*0.9):]

    # encode with tiktoken gpt2 bpe
    enc = tiktoken.get_encoding("gpt2")
    train_ids = enc.encode_ordinary(train_data)
    val_ids = enc.encode_ordinary(val_data)
    print(f"train has {len(train_ids):,} tokens")
    print(f"val has {len(val_ids):,} tokens")

    # export to bin files
    train_ids = np.array(train_ids, dtype=np.uint16)
    val_ids = np.array(val_ids, dtype=np.uint16)
    train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
    val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
Convert the large collection of webshell samples (unstructured data) from raw text into one large stream of integers (semi-structured data).
Code example:
import os
import requests
import tiktoken
import numpy as np

# foreach all the file and return the file's content
def read_files(data_path):
    content = ""
    for root, dirs, files in os.walk(data_path):
        for file in files:
            file_path = os.path.join(root, file)
            print("file_path", file_path)
            with open(file_path, "rb") as f:
                content += f.read().decode('utf-8', errors='ignore')
            # add one line ---- <EOF END OF THE FILE> -----
            content += "---- <EOF END OF THE FILE> ----- \n"
    return content

# write content into file
def write_file(content, input_file_path):
    with open(input_file_path, "w") as f:
        f.write(content)

if __name__ == "__main__":
    data_path = "./data/fomo_webshell/fomo_v2"
    input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
    content = read_files(data_path)
    write_file(content, input_file_path)

    with open(input_file_path, 'r') as f:
        data = f.read()

    # get all the unique characters that occur in this text
    chars = sorted(list(set(data)))
    vocab_size = len(chars)
    print("all the unique characters:", ''.join(chars))
    print(f"vocab size: {vocab_size:,}")

    n = len(data)
    train_data = data[:int(n*0.9)]
    val_data = data[int(n*0.9):]

    # encode with tiktoken gpt2 bpe
    enc = tiktoken.get_encoding("gpt2")
    train_ids = enc.encode_ordinary(train_data)
    val_ids = enc.encode_ordinary(val_data)
    print(f"train has {len(train_ids):,} tokens")
    print(f"val has {len(val_ids):,} tokens")

    # export to bin files
    train_ids = np.array(train_ids, dtype=np.uint16)
    val_ids = np.array(val_ids, dtype=np.uint16)
    train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
    val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
The command line is:
# Generate train.bin/val.bin with the OpenAI BPE tokenizer
python3 data/fomo_webshell/prepare.py
Training configuration parameters:
import time
out_dir = 'out-fomo_webshell'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'fomo_webshell'
wandb_run_name = 'ft-' + str(time.time())
dataset = 'fomo_webshell'
init_from = 'gpt2-xl' # this is the largest GPT-2 model
# only save checkpoints if the validation loss improves
always_save_checkpoint = False
# the number of examples per iter:
batch_size = 1
block_size = 256
vocab_size = 4096
gradient_accumulation_steps = 32
max_iters = 20
# finetune at constant LR
learning_rate = 3e-5
decay_lr = False
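With batch_size = 1 and gradient_accumulation_steps = 32, each optimizer step still accumulates gradients over 32 sequences, so the token count per iteration follows directly (a quick sanity check of the arithmetic):

batch_size = 1
block_size = 256
gradient_accumulation_steps = 32

# tokens seen per optimizer step = micro-batch tokens x accumulation steps
tokens_per_iter = batch_size * block_size * gradient_accumulation_steps
print(tokens_per_iter)  # 8192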
Load the gpt2-xl pretrained model together with the webshell data preprocessed earlier, and continue fine-tuning the base model.
screen python3 train.py config/finetune_fomo_webshell.py
# Load the local model and generate text from a string prompt
python3 sample.py --out_dir=out-fomo_webshell --start="<?php\n$a=$_GET[1];" --num_samples=1 --max_new_tokens=512
python3 sample.py --out_dir=out-fomo_webshell --start="生成一段webshell代码。" --num_samples=1 --max_new_tokens=256
GPT-2 already performs far better than earlier RNNs, whose gradients would vanish as soon as the sequence length grew even modestly.
The results are still not ideal; there are several possible reasons.