diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md
index 86ff7d8bf584558c97251c2b24ae4e65cde4c246..0daa8ae5059ac8cde69f321f95774ca80b331f5a 100644
--- a/CHANGE_LOG.md
+++ b/CHANGE_LOG.md
@@ -1,4 +1,4 @@
-# 2021.10.29
+# 2021.10.29 v0.1
 1. change `mixins` from `ModuleList` to `ModuleDict`
 2. return tokens and mems in `fill_sequence`, and mems becomes a tensor.
 3. `CachedAutoRegressiveMixin`
@@ -28,5 +28,8 @@ for the older framework, you also need:
 old['module']['transformer.word_embeddings.weight'] = old['module']['word_embeddings.weight']
 del old['module']['word_embeddings.weight']
 ```
+# 2021.11.5 v0.1.2
+1. Add `generation.autoregressive_sampling.evalute_perplexity`
+2. Fix `RuntimeError` when skipping NaN loss
diff --git a/SwissArmyTransformer/generation/sampling_strategies/beam_search_strategy_old.py b/SwissArmyTransformer/generation/sampling_strategies/beam_search_strategy_old.py
deleted file mode 100644
index aeab7989930250aa93b3231399152cd514b4162b..0000000000000000000000000000000000000000
--- a/SwissArmyTransformer/generation/sampling_strategies/beam_search_strategy_old.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# -*- encoding: utf-8 -*-
-'''
-@File    :   base_strategy.py
-@Time    :   2021/10/08 22:22:42
-@Author  :   Ming Ding
-@Contact :   dm18@mail.tsinghua.edu.cn
-'''
-
-# here put the import lib
-import torch
-import torch.nn.functional as F
-from abc import ABC, abstractmethod
-from collections import UserDict
-from typing import Optional, Tuple, List, Iterable, Union
-
-
-class BeamScorer(ABC):
-    """
-    Abstract base class for all beam scorers that are used for :meth:`~transformers.PretrainedModel.beam_search` and
-    :meth:`~transformers.PretrainedModel.beam_sample`.
-    """
-
-    @abstractmethod
-    def process(
-        self,
-        input_ids: torch.LongTensor,
-        next_scores: torch.FloatTensor,
-        next_tokens: torch.LongTensor,
-        next_indices: torch.LongTensor,
-        **kwargs
-    ) -> Tuple[torch.Tensor]:
-        raise NotImplementedError("This is an abstract method.")
-
-    @abstractmethod
-    def finalize(
-        self,
-        input_ids: torch.LongTensor,
-        next_scores: torch.FloatTensor,
-        next_tokens: torch.LongTensor,
-        next_indices: torch.LongTensor,
-        **kwargs
-    ) -> torch.LongTensor:
-        raise NotImplementedError("This is an abstract method.")
-
-
-class BeamSearchScorer(BeamScorer):
-    r"""
-    :class:`transformers.BeamScorer` implementing standard beam search decoding.
-
-    Adapted in part from `Facebook's XLM beam search code
-    <https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__.
-
-    Args:
-        batch_size (:obj:`int`):
-            Batch Size of :obj:`input_ids` for which beam search decoding is run in parallel.
-        max_length (:obj:`int`):
-            The maximum length of the sequence to be generated.
-        num_beams (:obj:`int`):
-            Number of beams for beam search.
-        device (:obj:`torch.device`):
-            Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of
-            :obj:`BeamSearchScorer` will be allocated.
-        length_penalty (:obj:`float`, `optional`, defaults to 1.0):
-            Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
-            model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
-            sequences.
-        do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
- num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): - The number of beam hypotheses that shall be returned upon calling - :meth:`~transformer.BeamSearchScorer.finalize`. - """ - - def __init__( - self, - batch_size: int, - max_length: int, - num_beams: int, - device: Union[torch.device, str], - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[bool] = False, - num_beam_hyps_to_keep: Optional[int] = 1, - ): - self.max_length = max_length - self.num_beams = num_beams - self.device = device - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - - self._is_init = False - self._beam_hyps = [ - BeamHypotheses( - num_beams=self.num_beams, - max_length=self.max_length, - length_penalty=self.length_penalty, - early_stopping=self.do_early_stopping, - ) - for _ in range(batch_size) - ] - self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device) - - # if not isinstance(num_beams, int) or num_beams <= 1: - # raise ValueError( - # f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1, one should make use of `greedy_search` instead." - # ) - - @property - def is_done(self) -> bool: - return self._done.all() - - def process( - self, - input_ids: torch.LongTensor, - next_scores: torch.FloatTensor, - next_tokens: torch.LongTensor, - next_indices: torch.LongTensor, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - mems=None - ) -> Tuple[torch.Tensor]: - cur_len = input_ids.shape[-1] - batch_size = len(self._beam_hyps) - assert batch_size == (input_ids.shape[0] // self.num_beams) - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - device = next_scores.device - next_beam_scores = torch.zeros((batch_size, self.num_beams), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.num_beams), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.num_beams), dtype=next_indices.dtype, device=device) - - for batch_idx, beam_hyp in enumerate(self._beam_hyps): - if self._done[batch_idx]: - assert ( - len(beam_hyp) >= self.num_beams - ), "Batch can only be done if at least {} beams have been generated".format(self.num_beams) - assert ( - eos_token_id is not None and pad_token_id is not None - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" - # pad the batch - next_beam_scores[batch_idx, :] = 0 - next_beam_tokens[batch_idx, :] = pad_token_id - next_beam_indices[batch_idx, :] = 0 - continue - - # next tokens for this sentence - beam_idx = 0 - for beam_token_rank, (next_token, next_score, next_index) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) - ): - batch_beam_idx = batch_idx * self.num_beams + next_index - # add to generated hypotheses if end of sentence - if (eos_token_id is not None) and (next_token.item() in eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams - if is_beam_token_worse_than_top_num_beams: - continue - beam_hyp.add( - input_ids[batch_beam_idx].clone(), - next_score.item(), - mems=[mem[[next_index.item()]] for mem in mems] if mems else None - ) - else: - # add next predicted token since it is not eos_token - next_beam_scores[batch_idx, beam_idx] = next_score - 
next_beam_tokens[batch_idx, beam_idx] = next_token - next_beam_indices[batch_idx, beam_idx] = batch_beam_idx - beam_idx += 1 - - # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.num_beams: - break - - if beam_idx < self.num_beams: - raise ValueError( - f"At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." - ) - - # Check if we are done so that we can save a pad step if all(done) - self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( - next_scores[batch_idx].max().item(), cur_len - ) - - return UserDict( - { - "next_beam_scores": next_beam_scores.view(-1), - "next_beam_tokens": next_beam_tokens.view(-1), - "next_beam_indices": next_beam_indices.view(-1), - } - ) - - def finalize( - self, - input_ids: torch.LongTensor, - final_beam_scores: torch.FloatTensor, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - mems=None - ) -> Tuple[torch.LongTensor, List[torch.Tensor], torch.FloatTensor]: - batch_size = len(self._beam_hyps) - # finalize all open beam hypotheses and add to generated hypotheses - for batch_idx, beam_hyp in enumerate(self._beam_hyps): - if self._done[batch_idx]: - continue - - # need to add best num_beams hypotheses to generated hyps - for beam_id in range(self.num_beams): - batch_beam_idx = batch_idx * self.num_beams + beam_id - final_score = final_beam_scores[batch_beam_idx].item() - final_tokens = input_ids[batch_beam_idx] - beam_hyp.add(final_tokens, final_score, mems=[mem[[batch_beam_idx]] for mem in mems] if mems else None) - - # select the best hypotheses - sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) - best = [] - - # retrieve best hypotheses - for i, beam_hyp in enumerate(self._beam_hyps): - sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) - for j in range(self.num_beam_hyps_to_keep): - score, best_hyp, mems = sorted_hyps.pop() - sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) - best.append((best_hyp, mems, score)) - - # prepare for adding eos - sent_max_len = min(sent_lengths.max().item(), self.max_length) - decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - scores = final_beam_scores.new(batch_size * self.num_beam_hyps_to_keep) - # shorter batches are padded if needed - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`pad_token_id` has to be defined" - decoded.fill_(pad_token_id) - - # fill with hypotheses and eos_token_id if the latter fits in - mems = [] - for i, (hypo, mem, score) in enumerate(best): - scores[i] = score - decoded[i, : sent_lengths[i]] = hypo - if sent_lengths[i] < sent_max_len: - decoded[i, sent_lengths[i]] = eos_token_id - mems.append(mem) - mems = [torch.cat([mem[i] for mem in mems], dim=0) for i in range(len(mems[0]))] if mems and mems[0] else None - return decoded, mems, scores - - -class BeamHypotheses: - def __init__(self, num_beams: int, max_length: int, length_penalty: float, early_stopping: bool): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.num_beams = num_beams - self.beams = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. 
- """ - return len(self.beams) - - def add(self, hyp: torch.LongTensor, sum_logprobs: float, mems=None): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / (max(hyp.shape[-1], 1) ** self.length_penalty) - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp, mems)) - if len(self) > self.num_beams: - sorted_next_scores = sorted([(s, idx) for idx, (s, _, _) in enumerate(self.beams)]) - del self.beams[sorted_next_scores[0][1]] - self.worst_score = sorted_next_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: - """ - If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst - one in the heap, then we are done with this sentence. - """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty - ret = self.worst_score >= cur_score - return ret - - -class LogitsProcessor(ABC): - """Abstract base class for all logit processors that can be applied during generation.""" - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - """Torch method for processing logits.""" - raise NotImplementedError( - f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." - ) - - -class LogitsProcessorList(list): - """ - This class can be used to create a list of :class:`~transformers.LogitsProcessor` or - :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from - list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or - :class:`~transformers.LogitsProcessor` to the inputs. - """ - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - for processor in self: - scores = processor(input_ids, scores) - return scores - - -class MinLengthLogitsProcessor(LogitsProcessor): - r""" - :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. - - Args: - min_length (:obj:`int`): - The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. - eos_token_id (:obj:`int`): - The id of the `end-of-sequence` token. - """ - - def __init__(self, min_length: int, eos_token_ids: Union[List[int], int]): - if not isinstance(min_length, int) or min_length < 0: - raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}") - if isinstance(eos_token_ids, int): - eos_token_ids = [eos_token_ids] - for eos_token_id in eos_token_ids: - if not isinstance(eos_token_id, int) or eos_token_id < 0: - raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}") - - self.min_length = min_length - self.eos_token_ids = eos_token_ids - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - cur_len = input_ids.shape[-1] - if cur_len < self.min_length: - for eos_token_id in self.eos_token_ids: - scores[:, eos_token_id] = -float("inf") - return scores - - -class NoRepeatNGramLogitsProcessor(LogitsProcessor): - r""" - :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq - <https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__. 
- - Args: - ngram_size (:obj:`int`): - All ngrams of size :obj:`ngram_size` can only occur once. - """ - - def __init__(self, ngram_size: int): - if not isinstance(ngram_size, int) or ngram_size <= 0: - raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") - self.ngram_size = ngram_size - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - num_batch_hypotheses = scores.shape[0] - cur_len = input_ids.shape[-1] - banned_batch_tokens = self._calc_banned_ngram_tokens(input_ids, num_batch_hypotheses, cur_len) - - for i, banned_tokens in enumerate(banned_batch_tokens): - scores[i, banned_tokens] = -float("inf") - - return scores - - def _calc_banned_ngram_tokens( - self, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int - ) -> List[Iterable[int]]: - """Copied from fairseq for no_repeat_ngram in beam_search""" - if cur_len + 1 < self.ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - - def _get_generated_ngrams(hypo_idx): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - self.ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) - return generated_ngrams[hypo_idx].get(ngram_idx, []) - - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] - return banned_tokens - - -class BeamSearchStrategy: - def __init__(self, num_beams, max_length, length_penalty, end_tokens, device='cuda', no_repeat_ngram_size=0, - min_tgt_length=0): - self.num_beams = num_beams - self.max_length = max_length - self.length_penalty = length_penalty - self.end_tokens = end_tokens - self.no_repeat_ngram_size = no_repeat_ngram_size - self.min_tgt_length = min_tgt_length - self.processors = LogitsProcessorList() - if min_tgt_length > 0: - processor = MinLengthLogitsProcessor(min_tgt_length, self.end_tokens) - self.processors.append(processor) - if no_repeat_ngram_size > 0: - processor = NoRepeatNGramLogitsProcessor(no_repeat_ngram_size) - self.processors.append(processor) - self.beam_scorer = BeamSearchScorer( - batch_size=1, - max_length=max_length, - num_beams=num_beams, - device=device, - length_penalty=length_penalty, - do_early_stopping=False, - ) - self.beam_scores = torch.zeros(1, dtype=torch.float, device=device) - - @property - def is_done(self) -> bool: - return self.beam_scorer.is_done - - def forward(self, logits, tokens, mems): - last_beam_num = tokens.size(0) - logits = self.processors(tokens, logits.float()) - next_token_scores = F.log_softmax(logits, dim=-1) - next_token_scores = next_token_scores + self.beam_scores[:, None].expand_as(next_token_scores) - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(1, last_beam_num * vocab_size) - - probs = F.softmax(next_token_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=2 * self.num_beams) - next_token_scores = torch.gather(next_token_scores, -1, next_tokens) - next_token_scores, _indices = torch.sort(next_token_scores, descending=True, 
dim=1) - next_tokens = torch.gather(next_tokens, -1, _indices) - - next_indices = next_tokens // vocab_size - next_tokens = next_tokens % vocab_size - # stateless - tokens = tokens.expand((self.num_beams, -1)) - beam_outputs = self.beam_scorer.process( - tokens, - next_token_scores, - next_tokens, - next_indices, - eos_token_id=self.end_tokens, - mems=mems - ) - self.beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - beam_next_tokens = beam_next_tokens.unsqueeze(-1) - tokens = torch.cat([tokens[beam_idx, :], beam_next_tokens], dim=-1) - mems = [mem[beam_idx] for mem in mems] if mems else None - return tokens, mems - - def finalize(self, tokens, mems): - tokens, mems, scores = self.beam_scorer.finalize(tokens, self.beam_scores, - eos_token_id=self.end_tokens[0], - mems=mems) - return tokens, mems diff --git a/SwissArmyTransformer/mpu/transformer.py b/SwissArmyTransformer/mpu/transformer.py index c2e97cd5233a4d9edbcc1ce8cab394d5ea8f9a37..6b0c39ccff27716ddee1ed5c6d1cdffb9ba850e4 100755 --- a/SwissArmyTransformer/mpu/transformer.py +++ b/SwissArmyTransformer/mpu/transformer.py @@ -141,8 +141,9 @@ class SelfAttention(torch.nn.Module): class MLP(torch.nn.Module): def __init__(self, hidden_size, output_dropout_prob, init_method, - output_layer_init_method=None, hooks={}): + output_layer_init_method=None, layer_id=None, hooks={}): super(MLP, self).__init__() + self.layer_id = layer_id # Set output layer initialization if not provided. if output_layer_init_method is None: output_layer_init_method = init_method @@ -225,6 +226,7 @@ class BaseTransformerLayer(torch.nn.Module): output_dropout_prob, init_method, output_layer_init_method=output_layer_init_method, + layer_id=layer_id, hooks=hooks ) diff --git a/SwissArmyTransformer/training/deepspeed_training.py b/SwissArmyTransformer/training/deepspeed_training.py index 6dc457301432a4e2c2c24f377be349df452e6324..335fc1919d2abb048127dc0bc59e5c4badc33d1e 100644 --- a/SwissArmyTransformer/training/deepspeed_training.py +++ b/SwissArmyTransformer/training/deepspeed_training.py @@ -467,6 +467,8 @@ def report_iteration_metrics(summary_writer, optimizer, lr, loss, elapsed_time, summary_writer.add_scalar(f'Train/lr', lr, step) summary_writer.add_scalar(f'Train/train_loss', loss, step) summary_writer.add_scalar(f'Train/elapsed_time', elapsed_time, step) + for key in avg_metrics: + summary_writer.add_scalar('Train/'+key, avg_metrics[key], step) def report_evaluate_metrics(summary_writer, prefix, loss, ppl, step):
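
---

A note on the changelog's `evalute_perplexity` addition: this diff does not show the function's body or signature, so the sketch below only illustrates the generic computation such an evaluator performs — masked mean token negative log-likelihood, exponentiated. The helper name and tensor shapes are assumptions; this is not the library's implementation.

```python
import torch
import torch.nn.functional as F

def perplexity_sketch(logits, targets, loss_mask):
    """Generic masked perplexity (hypothetical helper, not the
    library's evalute_perplexity).

    logits:    [batch, seq, vocab] model outputs
    targets:   [batch, seq] gold token ids
    loss_mask: [batch, seq] 1.0 where the token counts toward the loss
    """
    # Per-token negative log-likelihood, then reshape back to [batch, seq].
    nll = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        targets.reshape(-1),
        reduction='none',
    ).reshape(targets.shape)
    # ppl = exp(total masked NLL / number of scored tokens)
    return torch.exp((nll * loss_mask).sum() / loss_mask.sum())

# Tiny smoke test with random tensors.
logits = torch.randn(2, 5, 11)
targets = torch.randint(0, 11, (2, 5))
mask = torch.ones(2, 5)
print(perplexity_sketch(logits, targets, mask))
```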
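On the `transformer.py` hunk: `layer_id` is now threaded from `BaseTransformerLayer` into `MLP` alongside `hooks`, so per-layer customization or debugging can tell layers apart. The repo's actual hook convention is not visible in this diff; the toy module below only illustrates the idea, with a made-up `mlp_forward` hook name and signature.

```python
import torch

class TinyMLP(torch.nn.Module):
    """Hypothetical illustration of why MLP carries layer_id."""
    def __init__(self, hidden_size, layer_id=None, hooks=None):
        super().__init__()
        self.layer_id = layer_id
        self.hooks = hooks or {}
        self.dense = torch.nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        if 'mlp_forward' in self.hooks:
            # Pass layer_id so one shared hook can branch per layer.
            return self.hooks['mlp_forward'](x, layer_id=self.layer_id)
        return self.dense(x)

mlp = TinyMLP(8, layer_id=3,
              hooks={'mlp_forward': lambda x, layer_id: x * (layer_id + 1)})
print(mlp(torch.ones(1, 8)).shape)  # torch.Size([1, 8])
```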
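On the `deepspeed_training.py` hunk: `report_iteration_metrics` now writes every entry of `avg_metrics` to TensorBoard under the same `Train/` namespace as `lr` and `train_loss`. A minimal standalone sketch, assuming `avg_metrics` is a plain dict mapping metric names to floats (the log directory, names, and values below are hypothetical):

```python
from torch.utils.tensorboard import SummaryWriter

summary_writer = SummaryWriter(log_dir='runs/demo')  # hypothetical log dir
step = 100
avg_metrics = {'acc': 0.71, 'aux_loss': 0.03}  # hypothetical metrics

# Mirrors the added loop: one scalar per metric, per iteration.
for key in avg_metrics:
    summary_writer.add_scalar('Train/' + key, avg_metrics[key], step)
summary_writer.close()
```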