C++ Primer note

C++ Primer note

graph algorithms

Basic graph algorithms, graphs and trees, shortest paths, Eulerian graphs

loss function

PyTorch loss functions

Python itertools

A summary of commonly used functions in Python itertools

git remote commands

Five git remote operations explained in detail

tree debug

class TreeNode(object):
    def __init__(self, x):
        self.val = x
        self.left = None
        self.right = None

def listToTreeNode(inputValues):
    # Build a binary tree from a level-order list where None marks a missing child.
    if not inputValues:
        return None
    root = TreeNode(inputValues[0])
    nodeQueue = [root]
    front = 0
    index = 1
    while index < len(inputValues):
        # take the next parent node off the queue
        node = nodeQueue[front]
        front = front + 1

        # attach the left child, if present
        item = inputValues[index]
        index = index + 1
        if item is not None:
            node.left = TreeNode(item)
            nodeQueue.append(node.left)

        if index >= len(inputValues):
            break

        # attach the right child, if present
        item = inputValues[index]
        index = index + 1
        if item is not None:
            node.right = TreeNode(item)
            nodeQueue.append(node.right)
    return root

def printTree(root):
    # Level-order traversal: print each node followed by its left and right children.
    if not root:
        return
    queue = []
    queue.append(root)
    ret = []
    while len(queue):
        for i in range(len(queue)):
            res = []
            curNode = queue.pop(0)
            res.append(curNode.val)
            if curNode.left:
                queue.append(curNode.left)
                res.append(curNode.left.val)
            else:
                res.append('None')
            if curNode.right:
                queue.append(curNode.right)
                res.append(curNode.right.val)
            else:
                res.append('None')
            ret.append(res)
    for item in ret:
        print('{} -> {}, {}'.format(*item))

if __name__ == "__main__":
    a = [1, 2, 3, 4, None, 6, None]
    root = listToTreeNode(a)
    printTree(root)
import java.util.*;

class TreeNode {
    int val;
    TreeNode left;
    TreeNode right;
    TreeNode(int x) { val = x; }
}

public class Solution {
    public static TreeNode listToTreeNode(String input) {
        String[] parts = input.split(",");
        String item = parts[0];
        TreeNode root = new TreeNode(Integer.parseInt(item));
        Queue<TreeNode> nodeQueue = new LinkedList<>();
        nodeQueue.add(root);

        int index = 1;
        while(!nodeQueue.isEmpty()) {
            TreeNode node = nodeQueue.remove();

            if (index == parts.length)
                break;

            item = parts[index++];
            item = item.trim();
            if (!item.equals("null")) {
                int leftNumber = Integer.parseInt(item);
                node.left = new TreeNode(leftNumber);
                nodeQueue.add(node.left);
            }

            if (index == parts.length)
                break;

            item = parts[index++];
            item = item.trim();
            if (!item.equals("null")) {
                int rightNumber = Integer.parseInt(item);
                node.right = new TreeNode(rightNumber);
                nodeQueue.add(node.right);
            }
        }
        return root;
    }

    public static void printTree(TreeNode root){
        if(root == null)
            return;
        LinkedList<TreeNode> queue = new LinkedList<>();
        queue.add(root);
        List<List<String>> ret = new LinkedList<>();

        while(!queue.isEmpty()){
            for(int i = 0; i < queue.size(); i++){
                ArrayList<String > res = new ArrayList<>();
                TreeNode curNode = queue.removeFirst();
                res.add(String.valueOf(curNode.val));
                if(curNode.left != null) {
                    queue.add(curNode.left);
                    res.add(String.valueOf(curNode.left.val));
                } else
                    res.add("null");
                if(curNode.right != null) {
                    queue.add(curNode.right);
                    res.add(String.valueOf(curNode.right.val));
                } else
                    res.add("null");
                ret.add(res);
            }
        }
        for(List<String> item: ret)
            System.out.printf("%s -> %s, %s\n", item.get(0), item.get(1), item.get(2));
    }

    public static void main(String[] args) {
        TreeNode root = listToTreeNode("1,2,3,4,null,6,null");
        printTree(root);
    }
    }
}

cs224n Lecture Notes 7

This set of notes discusses and describes the many variants of the RNN (Recursive Neural Network) and their applications and successes in the field of NLP.

matrix math

class BasicMatrixMath{
    private int mod = 1000000007;
    public long[][] add(long[][] matrixa, long[][] matrixb) {
        for(int i = 0; i < matrixa.length; i++)
            for(int j = 0; j < matrixa[0].length; j++) {
                matrixa[i][j] = matrixa[i][j] + matrixb[i][j];
                matrixa[i][j] %= mod;
            }
        return matrixa;
    }

    public long[][] sub(long[][] matrixa, long[][] matrixb) {
        for(int i = 0; i < matrixa.length; i++)
            for(int j = 0; j < matrixa[0].length; j++) {
                // keep results non-negative under the modulus
                matrixa[i][j] = ((matrixa[i][j] - matrixb[i][j]) % mod + mod) % mod;
            }
        return matrixa;
    }

    public long[][] mul(long[][] matrixa, long[][] matrixb) {
        long[][] result = new long[matrixa.length][matrixb[0].length];
        for(int i = 0; i < matrixa.length; i++) {
            for(int j = 0; j < matrixb[0].length; j++) {
                result[i][j] = 0;
                for (int k = 0; k < matrixa[0].length; k++) {
                    result[i][j] += matrixa[i][k] * matrixb[k][j];
                    result[i][j] %= mod;
                }
            }
        }
        return result;
    }

    public long[][] power(long[][] a, int n){
        // fast exponentiation by squaring; assumes n >= 1
        if(n == 1)
            return a;
        long[][] c;
        long[][] b = power(a, n/2);
        c = mul(b, b);
        if(n % 2 == 0)
            return c;
        else{
            b = mul(c, a);
            return b;
        }
    }
}

Application: accelerating linear recurrences with matrix exponentiation

Recurrence:

Matrix transformation:

Matrix exponentiation speeds the computation up from $O(n)$ to $O(\log n)$.
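As an assumed illustration (not necessarily the recurrence used in the original note), take the Fibonacci recurrence $F_n=F_{n-1}+F_{n-2}$: it can be rewritten as a matrix power, which the power() method above evaluates with $O(\log n)$ matrix multiplications:

$$\begin{pmatrix}F_{n+1}\\F_n\end{pmatrix}=\begin{pmatrix}1&1\\1&0\end{pmatrix}\begin{pmatrix}F_n\\F_{n-1}\end{pmatrix}\quad\Longrightarrow\quad\begin{pmatrix}F_{n+1}\\F_n\end{pmatrix}=\begin{pmatrix}1&1\\1&0\end{pmatrix}^{n}\begin{pmatrix}F_1\\F_0\end{pmatrix}$$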

cs224n Lecture Notes 6

Keyphrases:

Seq2Seq and Attention Mechanisms, Neural Machine Translation, Speech Processing


Neural Machine Translation with Seq2Seq

encoder

The encoder network’s job is to read the input sequence to our Seq2Seq model and generate a fixed-dimensional context vector C for the sequence.

  1. LSTM or stacked LSTM: compress an arbitrary-length sequence into a single fixed-size vector.
  2. process the input sequence in reverse

decoder

  1. Initialize the hidden state of our first layer with the context vector from above.
  2. pass in a special token to signify the start of output generation.
  3. pass that output word into the first layer, and repeat the generation (a minimal sketch follows this list).
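
A minimal sketch of this encoder-decoder loop, assuming PyTorch is available; the vocabulary sizes, dimensions, and token ids below are made up for illustration, and decoding is greedy rather than beam search:

import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, emb_dim=32, hidden_dim=64):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, emb_dim)
        self.tgt_emb = nn.Embedding(tgt_vocab, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, tgt_vocab)

    def forward(self, src, sos_id, max_len=10):
        # encoder: compress the source sequence into a fixed-size state (h, c)
        _, state = self.encoder(self.src_emb(src))
        # decoder: start from the SOS token and generate one word at a time
        token = torch.full((src.size(0), 1), sos_id, dtype=torch.long)
        outputs = []
        for _ in range(max_len):
            dec_out, state = self.decoder(self.tgt_emb(token), state)
            logits = self.out(dec_out[:, -1])            # (batch, tgt_vocab)
            token = logits.argmax(dim=-1, keepdim=True)  # feed the prediction back in
            outputs.append(token)
        return torch.cat(outputs, dim=1)

model = Seq2Seq(src_vocab=100, tgt_vocab=120)
src = torch.randint(0, 100, (2, 7))    # a batch of 2 source sentences of length 7
print(model(src, sos_id=1).shape)      # torch.Size([2, 10])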

Bidirectional RNNs

We simply add another cell, but feed inputs to it in the opposite direction.

Attention Mechanism

Attention mechanisms make use of this observation by providing the decoder network with a look at the entire input sequence at every decoding step; the decoder can then decide what input words are important at any point in time.

  • $h_i$: hidden vectors representing the input sentence.
  • $s_i$: hidden vectors representing the output sentence.
  • $y_i$: generated word
  • $c_i$: context vector that captures the context from the original sentence that is relevant to time step $i$ of the decoder.

The attention scores $\alpha_{i,j}$ at decoding step $i$ signify the words in the source sentence that align with word $i$ in the target. We can use the attention scores to build an alignment table – a table mapping words in the source to corresponding words in the target sentence.
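
A minimal numerical sketch of this computation, assuming simple dot-product scoring (the exact scoring function in the notes may differ): score each encoder hidden vector $h_j$ against the current decoder state $s_i$, normalize the scores into $\alpha_{i,j}$ with a softmax, and take the weighted sum as the context vector $c_i$.

import numpy as np

def attention_context(decoder_state, encoder_states):
    # decoder_state: (d,), encoder_states: (T, d); returns (alpha, context)
    scores = encoder_states @ decoder_state          # dot-product scores, shape (T,)
    scores = scores - scores.max()                   # for numerical stability
    alpha = np.exp(scores) / np.exp(scores).sum()    # attention weights alpha_{i,j}
    context = alpha @ encoder_states                 # context vector c_i, shape (d,)
    return alpha, context

# toy example: 5 source positions, hidden size 4
h = np.random.randn(5, 4)
s = np.random.randn(4)
alpha, c = attention_context(s, h)
print(alpha.sum(), c.shape)   # ~1.0, (4,)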

This model can efficiently translate long sentences.

Sequence model decoders

Another approach to machine translation comes from statistical machine translation. Consider a model that computes the probability $P(\bar{s}\mid s)$ of a translation $\bar{s}$ given the original sentence $s$. We want to pick the translation that maximizes this probability, $\bar{s}^{*}=\operatorname{argmax}_{\bar{s}}\,P(\bar{s}\mid s)$.

As the search space can be huge, we need to shrink its size.

Beam search: the idea is to maintain $K$ candidates at each time step,

$$H_t=\{(x_1^1,\dots,x_t^1),\dots,(x_1^K,\dots,x_t^K)\},$$

and compute $H_{t+1}$ by expanding $H_t$ and keeping the best $K$ candidates.
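
A minimal sketch of beam search over a toy next-word distribution; next_log_probs below is a made-up stand-in for the decoder's softmax output, not part of the notes.

import math

VOCAB = ['a', 'b', '</s>']

def next_log_probs(prefix):
    # toy "model": a fixed distribution over the vocabulary, regardless of the prefix
    return {w: math.log(p) for w, p in zip(VOCAB, [0.5, 0.3, 0.2])}

def beam_search(K=2, max_len=4):
    beam = [(0.0, [])]                               # H_0: the empty hypothesis
    for _ in range(max_len):
        candidates = []
        for score, words in beam:
            if words and words[-1] == '</s>':        # finished hypotheses carry over
                candidates.append((score, words))
                continue
            for w, lp in next_log_probs(words).items():
                candidates.append((score + lp, words + [w]))
        # expand H_t and keep only the best K candidates: this is H_{t+1}
        beam = sorted(candidates, key=lambda x: x[0], reverse=True)[:K]
    return beam

for score, words in beam_search():
    print(round(score, 3), ' '.join(words))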

Evaluation of Machine Translation Systems

Bilingual Evaluation Understudy (BLEU)

Let $p_n$ denote the precision score for matched n-grams and $w_n=1/2^n$ be a geometric weighting for the precision of the $n$-th gram. The brevity penalty is defined as

$$\beta = e^{\min\left(0,\;1-\frac{len_{ref}}{len_{MT}}\right)},$$

where $len_{ref}$ is the length of the reference translation and $len_{MT}$ is the length of the machine translation.

The BLEU score is then defined as

$$BLEU = \beta \prod_{n=1}^{k} p_n^{w_n}.$$

Dealing with the large output vocabulary

The softmax can be quite expensive to compute with a large vocabulary, since its cost scales proportionally to the vocabulary size.

Scaling softmax

  1. Noise Contrastive Estimation: approximate the softmax by randomly sampling K words from the negative samples.
  2. Hierarchical Softmax

Both methods only save computation during training; at test time the full softmax still has to be computed.

Reducing vocabulary

Partition the training data into subsets, each containing $t$ unique target words. This concept is very similar to NCE; however, the main difference is that the negative samples are drawn from a biased distribution $Q$ for each subset $V'$, where

$$Q(y_t)=\begin{cases}\dfrac{1}{|V'|} & \text{if } y_t \in V' \\ 0 & \text{otherwise.}\end{cases}$$

Handling unknown words

The final prediction is either the word $y_t^w$ chosen by the softmax over the candidate list, as in previous methods, or $y_t^l$ copied from the source text.

Word and character-based models

These models deal with rare or unknown words.

Word segmentation

Representing rare and unknown words as a sequence of subword units.

One can choose to either build separate vocabularies for training and test sets or build one vocabulary jointly.
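
One common way to learn such subword units is byte-pair encoding (BPE, Sennrich et al.): start from characters and repeatedly merge the most frequent adjacent symbol pair. A minimal sketch on a made-up toy vocabulary:

import re
import collections

def get_stats(vocab):
    # count how often each adjacent symbol pair occurs, weighted by word frequency
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    # replace every occurrence of the pair with its merged symbol
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    return {pattern.sub(''.join(pair), word): freq for word, freq in vocab.items()}

# words are written as space-separated symbols; </w> marks the end of a word
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
for _ in range(10):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(best)                      # the merge learned at this step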

Hybrid NMT

Word-based Translation as a Backbone: The core of the hybrid NMT is a deep LSTM encoder-decoder that translates at the word level.

Source Character-based Representation: We learn a deep LSTM model over characters of rare words, and use the final hidden state of the LSTM as the representation for the rare word.

Target Character-level Generation: The solution is to have a separate deep LSTM that “translates” at the character level given the current word-level state. The system is trained such that whenever the word-level NMT produces an $\langle unk\rangle$, the character-level decoder is asked to recover the correct surface form of the unknown target word.

cs224n Lecture Notes 5

Keyphrases:

Language Models. RNN. Bi-directional RNN. Deep RNN. GRU. LSTM.


Language Models

Language models compute the probability of occurrence of a number of words in a particular sequence.
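
In the standard formulation, the probability of a sequence of $m$ words is factored with the chain rule and then approximated by conditioning each word only on a window of the previous $n$ words:

$$P(w_1,\dots,w_m)=\prod_{i=1}^{m}P(w_i\mid w_1,\dots,w_{i-1})\approx\prod_{i=1}^{m}P(w_i\mid w_{i-n},\dots,w_{i-1})$$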

In some cases, the window of past consecutive $n$ words may not be sufficient to capture the context. Bengio et al. introduced the first large-scale deep learning model for natural language processing that can capture this type of context by learning a distributed representation of words.

In all conventional language models, the memory requirements of the system grow exponentially with the window size $n$, making it nearly impossible to model large word windows without running out of memory.

Recurrent Neural Networks (RNN)

Recurrent Neural Networks (RNN) are capable of conditioning the model on all previous words in the corpus.

Below are the details associated with each parameter in the network (a small numerical sketch follows the list):

  • $x_1,\dots,x_t,\dots,x_T$: the word vectors corresponding to a corpus with $T$ words
  • $h_t=\sigma(W^{(hh)}h_{t-1}+W^{(hx)}x_{t})$: the relationship used to compute the hidden layer output features at each time-step $t$
  • $\hat{y}_t=\mathrm{softmax}(W^{(S)}h_t)$: the output probability distribution over the vocabulary at each time-step $t$
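
A small numerical sketch of the two equations above in plain numpy (all weights are random placeholders and the dimensions are made up):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

d, D_h, V = 8, 16, 100                      # word-vector size, hidden size, vocabulary size
W_hh = 0.01 * np.random.randn(D_h, D_h)     # W^(hh)
W_hx = 0.01 * np.random.randn(D_h, d)       # W^(hx)
W_S  = 0.01 * np.random.randn(V, D_h)       # W^(S)

h = np.zeros(D_h)                           # h_0
for x in [np.random.randn(d) for _ in range(5)]:   # five word vectors x_1..x_5
    h = sigmoid(W_hh @ h + W_hx @ x)        # h_t = sigma(W^(hh) h_{t-1} + W^(hx) x_t)
    y_hat = softmax(W_S @ h)                # y_hat_t = softmax(W^(S) h_t)
print(y_hat.shape, y_hat.sum())             # (100,) ~1.0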

The loss function used in RNNs is often the cross entropy error.

Perplexity is related to the cross-entropy error: it is basically 2 raised to the power of the cross-entropy loss. Perplexity is a measure of confusion, where lower values imply more confidence in predicting the next word in the sequence.
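
In symbols, with the standard cross-entropy loss $J$ averaged over a corpus of $T$ words:

$$J^{(t)}(\theta)=-\sum_{j=1}^{|V|}y_{t,j}\log\hat{y}_{t,j},\qquad J=\frac{1}{T}\sum_{t=1}^{T}J^{(t)}(\theta),\qquad \text{Perplexity}=2^{J}$$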

The amount of memory required to run a layer of RNN is proportional to the number of words in the corpus.

Deep Bidirectional RNNs

Predictions may depend on future words as well; a bidirectional RNN handles this by having a second RNN read through the corpus backwards and combining the hidden states from both directions.

Gated Recurrent Units

Although RNNs can theoretically capture long-term dependencies, they are very hard to actually train to do this. Gated recurrent units are designed in a manner to have more persistent memory thereby making it easier for RNNs to capture long-term dependencies.
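
The standard GRU equations, with update gate $z_t$, reset gate $r_t$, new memory $\tilde{h}_t$, and hidden state $h_t$:

$$z_t=\sigma(W^{(z)}x_t+U^{(z)}h_{t-1})$$
$$r_t=\sigma(W^{(r)}x_t+U^{(r)}h_{t-1})$$
$$\tilde{h}_t=\tanh(r_t\circ Uh_{t-1}+Wx_t)$$
$$h_t=(1-z_t)\circ\tilde{h}_t+z_t\circ h_{t-1}$$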

Long Short-Term Memories (LSTM)
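
The standard LSTM equations, with input gate $i_t$, forget gate $f_t$, output gate $o_t$, new memory cell $\tilde{c}_t$, final memory cell $c_t$, and hidden state $h_t$:

$$i_t=\sigma(W^{(i)}x_t+U^{(i)}h_{t-1})$$
$$f_t=\sigma(W^{(f)}x_t+U^{(f)}h_{t-1})$$
$$o_t=\sigma(W^{(o)}x_t+U^{(o)}h_{t-1})$$
$$\tilde{c}_t=\tanh(W^{(c)}x_t+U^{(c)}h_{t-1})$$
$$c_t=f_t\circ c_{t-1}+i_t\circ\tilde{c}_t$$
$$h_t=o_t\circ\tanh(c_t)$$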