The data are CSV files produced by splitting CT files, stored under SOURCE_PATH = "split_ct_output". Focal loss is used as the loss function. The results are not ideal:

Epoch 1/20
36/883 [>.............................] - ETA: 20:38 - loss: nan - accuracy: 0.9957

The loss becomes NaN, and the accuracy is deceptively high. (The high accuracy is expected regardless of what the network learns: the 128x128 contact matrix is almost all zeros, so predicting "no pair" everywhere already scores above 99%.) Searching for the cause, several articles point out that focal loss hits log(0) when a predicted probability saturates at 0 or 1, so p has to be clipped to a valid range; that clip is now applied inside binary_focal_loss below, and I will update the results after retraining.
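For reference, here is a minimal sketch of that clipping fix (my phrasing of the remedy those articles describe, using only the Keras backend API):

from tensorflow.keras import backend as K

def clip_probs(y_pred):
    # Keep probabilities strictly inside (0, 1) so K.log never sees an exact 0.
    return K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())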
On top of the purely learned input, some physicochemical properties of the dinucleotides were added as auxiliary features; see the dictionaries dic and dic1 in the code.
The full code follows.
import os
import time
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow as tf
os.environ['KERAS_BACKEND'] = 'tensorflow'  # effectively a no-op: tensorflow.keras is imported directly above
from tensorflow.keras import backend as K
from tensorflow.keras import layers, optimizers
import numpy as np
# from keras.callbacks import ModelCheckpoint, LearningRateScheduler
# from keras import backend as keras
MATRIX_SIZE = 128
x_list, y_list = [], []
# x_list1 = []
dic = {"AA":[-0.83,-0.67,-1.13,1.34,-1.84,-0.26,1.72,0.95,0.29,1.35],
"AC":[-0.14,-1.64,1.50,0.49,0.54,0.34,-0.27,-0.31,0.23,-0.26],
"AG": [0.55,0.00,-0.79,0.12,0.09,-0.86,0.13,0.82,0.10,0.28],
"AU": [-0.14,-0.62,-0.96,0.87,0.98,0.93,0.61,1.42,-0.83,1.35],
"CA":[-1.87,0.62,0.48,0.33,0.83,-0.26,0.15,-0.57,-0.17,0.14],
"CC":[0.78,0.09,-0.53,-1.36,-0.20,0.34,-1.13,-0.08,2.02,-1.34],
"CG": [0.55,1.60,2.09,-1.95,-0.80,-2.65,0.06,0.79,-0.97,-0.13],
"CU": [0.55,0.00,-0.79,0.12,0.09,-0.86,0.13,0.82,0.10,0.28],
"GA":[1.47,0.40,0.14,-0.94,1.28,0.34,-0.72,-1.82,-0.04,-0.53],
"GC":[-0.37,-1.07,0.14,0.71,-0.65,2.13,-1.77,-1.72,-1.83,-2.01],
"GG": [0.78,1.60,-0.53,-1.36,-0.20,0.34,-1.13,-0.88,2.02,-1.34],
"GU": [-0.14,-1.64,1.50,0.49,0.54,0.34,-0.27,-0.31,0.23,-0.26],
"UA":[0.09,0.98,-0.62,0.39,-0.95,0.34,1.34,-0.27,-0.31,-1.23,1.08],
"UC":[1.47,0.40,0.14,-0.94,1.28,0.34,-0.72,-0.31,-0.04,-0.26],
"UG": [-1.87,0.62,0.48,0.33,0.83,-0.26,0.15,0.82,-0.17,0.28],
"UU": [-0.83,-0.67,-1.13,1.34,-1.84,-0.26,1.72,0.95,0.29,1.35]
}
# dic1: apparently the same properties, min-max normalized per column to [0, 1]
# (spot checks support this; see the sanity-check sketch after the dictionary).
dic1 = {
"AA":[0.31 ,0.30 ,0.00 ,1.00 ,0.00 ,0.50 ,0.87 , 0.85 , 0.55 , 1.00 ],
"AC":[0.52 ,0.00 , 0.82 ,0.74 ,0.76 , 0.63 , 0.43 , 0.47 , 0.54 , 0.52 ],
"AG": [0.72 ,0.51 , 0.11 , 0.63 ,0.62 , 0.37 , 0.54 , 0.81 , 0.50 , 0.68 ],
"AU": [0.52 , 0.31 , 0.05 , 0.86 , 0.90 , 0.75, 0.68 , 1.00 , 0.26 , 1.00 ],
"CA":[0.00 ,0.70 , 0.50 , 0.69 , 0.86 , 0.50 ,0.55 , 0.39 , 0.43 , 0.64 ],
"CC":[0.79 , 0.53 , 0.19 , 0.18 , 0.53 , 0.63 , 0.47 , 0.29 , 1.00 , 0.20 ],
"CG": [0.72 , 1.00 , 1.00 , 0.00 , 0.56 , 0.00 , 0.52 , 0.81 , 0.22 , 0.56 ],
"CU": [0.72 ,0.51 , 0.11 , 0.63 , 0.62 ,0.37 ,0.54 ,0.81 , 0.50 , 0.68 ],
"GA":[1.00 , 0.63 , 0.39 , 0.31 , 1.00 , 0.63, 0.30 , 0.00 , 0.46 , 0.44 ],
"GC":[0.45 , 0.18 , 0.39 , 0.81 ,0.38 ,1.00 , 0.00 , 0.03, 0.00 , 0.00 ],
"GG": [0.79 , 1.00 , 0.19 , 0.18 , 0.53 , 0.63 , 0.18 , 0.29 , 1.00 , 0.20 ],
"GU": [0.52 , 0.00 ,0.82 , 0.74, 0.76, 0.63 , 0.43 , 0.47 , 0.54 , 0.52 ],
"UA":[0.59 , 0.81 , 0.16 , 0.71 , 0.29 , 0.63 , 0.89 , 0.65 , 0.16 , 0.92 ],
"UC":[0.91 , 0.63 , 0.39 , 0.31 , 1.00 , 0.63 , 0.30, 0.47 , 0.46 , 0.52 ],
"UG": [0.00 , 0.70 , 0.50 , 0.69 , 0.86 , 0.50 , 0.55 , 0.81 , 0.43 , 0.68 ],
"UU": [0.31 , 0.30 , 0.00 , 1.00 , 0.00 , 0.50 , 1.00 , 0.85 , 0.55 ,1.00 ]
}
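# Sanity-check sketch (my addition, an assumption about how dic1 was built):
# dic1 matches a per-column min-max normalization of dic, rounded to 2 decimals,
# for every entry I spot-checked. The 11-value "UA" row is truncated to 10 here,
# so that one row cannot line up until the typo above is fixed.
_vals = np.array([v[:10] for v in dic.values()], dtype=float)
_lo, _hi = _vals.min(axis=0), _vals.max(axis=0)
_dic1_check = {k: np.round((np.array(v[:10], dtype=float) - _lo) / (_hi - _lo), 2).tolist()
               for k, v in dic.items()}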
def ACGU(a, b):  # a, b are single bases from {A, C, G, U}
    # Look up the dinucleotide a+b. The original for/if/else returned the
    # fallback vector as soon as the first key failed to match, so every pair
    # except "AA" got [-1]*10; dict.get does the lookup correctly.
    return dic.get(a + b, [-1] * 10)
def RNAToMat0(source):
    seq = [t[1] for t in source]  # column 1 of each CSV row is the base character
    l = len(seq)
    b = np.zeros((MATRIX_SIZE, MATRIX_SIZE), dtype=float)  # np.float was removed from NumPy
    # Fill the upper triangle with the first of the 10 property values per pair.
    for i in range(l):
        for j in range(i, l):
            b[i][j] = ACGU(seq[i], seq[j])[0]
    return b
def binary_focal_loss(gamma=2, alpha=0.25):
    """
    Binary form of focal loss (for two-class problems):
        focal_loss(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t)
    where p = sigmoid(x), and p_t = p or 1 - p depending on whether the label is 1 or 0.
    Reference:
        https://arxiv.org/pdf/1708.02002.pdf
    Usage:
        model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
    """
    alpha = tf.constant(alpha, dtype=tf.float32)
    gamma = tf.constant(gamma, dtype=tf.float32)
    def binary_focal_loss_fixed(y_true, y_pred):
        """
        y_true shape must be (None, 1); y_pred must already be sigmoid outputs.
        """
        y_true = tf.cast(y_true, tf.float32)  # cast labels to float
        # Clip predictions away from 0 and 1 so K.log below never sees an exact 0
        # (the log(0) -> NaN issue described at the top of the post).
        y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
        alpha_t = y_true * alpha + (K.ones_like(y_true) - y_true) * (1 - alpha)
        p_t = y_true * y_pred + (K.ones_like(y_true) - y_true) * (K.ones_like(y_true) - y_pred)
        focal_loss = -alpha_t * K.pow(K.ones_like(y_true) - p_t, gamma) * K.log(p_t)
        return K.mean(focal_loss)
    return binary_focal_loss_fixed
# model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"], optimizer=optimizers.Adam(learning_rate=1e-4))
def focal_loss(gamma=2., alpha=.25):
    # Alternative focal-loss implementation; the epsilon inside each log already
    # guards against log(0). Note it uses K.sum rather than K.mean, so its scale differs.
    def focal_loss_fixed(y_true, y_pred):
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return (-K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(K.epsilon() + pt_1))
                - K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0 + K.epsilon())))
    return focal_loss_fixed
def csv2matrix_y(source):
    # Label matrix: cells (i, j) that are base-paired in the CT file are set to 1.
    matrix = np.zeros((MATRIX_SIZE, MATRIX_SIZE), dtype=float)  # np.float was removed from NumPy
    for t in source:
        if t[2] != '0':  # '0' in the pairing column means the base is unpaired
            first = min(int(t[0]), int(t[2]))
            second = max(int(t[0]), int(t[2]))
            matrix[first - 1][second - 1] = 1.0
    return matrix
def generate_data_list(SOURCE_PATH):
    count_skipped = 0
    count_kept = 0
    for i in os.listdir(SOURCE_PATH):  # every split CSV file in the source folder
        try:
            source = np.loadtxt("{}/{}".format(SOURCE_PATH, i), dtype=str, delimiter=',')  # np.str was removed from NumPy
            if len(source) > MATRIX_SIZE:
                count_skipped += 1
                # print("Sequence Too Long: skip " + i)
                continue
            count_kept += 1
            y_list.append(csv2matrix_y(source))
            x_list.append(RNAToMat0(source))
        except ValueError:
            print(f"Value Error: skip {i}")
    print(f"skipped (too long): {count_skipped}, kept: {count_kept}")
def get_model(shape):
stack = []
inputs = keras.Input(shape=shape)
x = inputs
pre = inputs
for filters in [32, 64, 128, 256]:
x = layers.SeparableConv2D(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.SeparableConv2D(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
x = layers.BatchNormalization()(x)
stack.append(x)
x = layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same')(x)
residual = layers.Conv2D(filters=filters, kernel_size=1, strides=2, padding='same')(pre)
x = layers.add([x, residual])
pre = x
x = layers.Conv2D(filters=512, kernel_size=3, padding='same', activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(filters=512, kernel_size=3, padding='same', activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
for filters in [256, 128, 64, 32]:
x = layers.UpSampling2D(size=(2, 2))(x)
x = layers.Conv2D(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
x = layers.Concatenate(axis=3)([x, stack.pop()])
x = layers.Conv2DTranspose(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2DTranspose(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
x = layers.BatchNormalization()(x)
residual = layers.UpSampling2D(size=(2, 2))(pre)
residual = layers.Conv2D(filters=filters, kernel_size=1, padding='same')(residual)
x = layers.add([x, residual])
pre = x
    # NOTE: the original used activation='softmax' here, but softmax over a single
    # channel is constant 1.0; a per-cell binary map needs sigmoid.
    outputs = layers.Conv2D(1, kernel_size=3, activation='sigmoid', padding='same')(x)
model = keras.Model(inputs, outputs)
return model
def unet(pretrained_weights=None, input_size=(256, 256, 1)):
inputs = layers.Input(input_size)
conv1 = layers.Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_normal')(inputs)
conv1 = layers.Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv1)
pool1 = layers.MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = layers.Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_normal')(pool1)
conv2 = layers.Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv2)
pool2 = layers.MaxPooling2D(pool_size=(2, 2))(conv2)
conv3 = layers.Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_normal')(pool2)
conv3 = layers.Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv3)
pool3 = layers.MaxPooling2D(pool_size=(2, 2))(conv3)
conv4 = layers.Conv2D(512, 3, activation='relu', padding='same', kernel_initializer='he_normal')(pool3)
conv4 = layers.Conv2D(512, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv4)
drop4 = layers.Dropout(0.5)(conv4)
pool4 = layers.MaxPooling2D(pool_size=(2, 2))(drop4)
conv5 = layers.Conv2D(1024, 3, activation='relu', padding='same', kernel_initializer='he_normal')(pool4)
conv5 = layers.Conv2D(1024, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv5)
drop5 = layers.Dropout(0.5)(conv5)
up6 = layers.Conv2D(512, 2, activation='relu', padding='same', kernel_initializer='he_normal')(
layers.UpSampling2D(size=(2, 2))(drop5))
merge6 = layers.concatenate([drop4, up6], axis=3)
conv6 = layers.Conv2D(512, 3, activation='relu', padding='same', kernel_initializer='he_normal')(merge6)
conv6 = layers.Conv2D(512, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv6)
up7 = layers.Conv2D(256, 2, activation='relu', padding='same', kernel_initializer='he_normal')(
layers.UpSampling2D(size=(2, 2))(conv6))
merge7 = layers.concatenate([conv3, up7], axis=3)
conv7 = layers.Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_normal')(merge7)
conv7 = layers.Conv2D(256, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv7)
up8 = layers.Conv2D(128, 2, activation='relu', padding='same', kernel_initializer='he_normal')(
layers.UpSampling2D(size=(2, 2))(conv7))
merge8 = layers.concatenate([conv2, up8], axis=3)
conv8 = layers.Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_normal')(merge8)
conv8 = layers.Conv2D(128, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv8)
up9 = layers.Conv2D(64, 2, activation='relu', padding='same', kernel_initializer='he_normal')(
layers.UpSampling2D(size=(2, 2))(conv8))
merge9 = layers.concatenate([conv1, up9], axis=3)
conv9 = layers.Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_normal')(merge9)
conv9 = layers.Conv2D(64, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv9)
conv9 = layers.Conv2D(2, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv9)
conv10 = layers.Conv2D(1, 1, activation='sigmoid')(conv9)
model = keras.Model(inputs=inputs, outputs=conv10)
# model.compile(optimizer=optimizers.Adam(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
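    # NOTE (my observation): learning_rate=0.1 is very aggressive for Adam and can
    # itself drive the loss to NaN; the 1e-4 in the commented line above is a more
    # typical choice.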
    model.compile(optimizer=optimizers.Adam(learning_rate=0.1), loss=[binary_focal_loss(alpha=0.25, gamma=2)], metrics=['accuracy'])
# model.compile(optimizer=optimizers.Adam(learning_rate=1e-4),loss=focal_loss(gamma=2,alpha=0.25),metrics=['accuracy'])
# model.summary()
if (pretrained_weights):
model.load_weights(pretrained_weights)
return model
def train_model():
    y_train = np.array(y_list, dtype=float)  # dataset labels (contact matrices)
    x_train = np.array(x_list, dtype=float)
# x_train1 = np.array(x_list1,dtype=float)
keras.backend.clear_session()
# m = get_model((MATRIX_SIZE, MATRIX_SIZE, 1, ))
m = unet(None,(MATRIX_SIZE, MATRIX_SIZE, 1))
# m.compile(optimizer=optimizers.Adam(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
# m.compile(optimizer=optimizers.Adam(learning_rate=0.1),loss=[binary_focal_loss(alpha=0.25, gamma=2)],metrics=['accuracy'])
history = m.fit(x_train, y_train, epochs=20 , batch_size=8, validation_split=0.2)
# history1 = m.fit(x_train1,y_train,epochs=2, batch_size=128, validation_split=0.9)
    # Plot the loss and accuracy curves from training
    acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'b', label='Training accuracy')
    plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend(loc='lower right')
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# m.save(f"model_{int(time.time())}")
SOURCE_PATH = "split_ct_output"
generate_data_list(SOURCE_PATH)
train_model()
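
Since the contact matrix is overwhelmingly zeros, accuracy will sit near 99% no matter what the network learns. Here is a minimal sketch (my addition, not from the original code; the helper name is hypothetical) of per-cell precision and recall for the rare "paired" class, which are far more informative here:

def pair_precision_recall(y_true, y_prob, threshold=0.5):
    # Treat each matrix cell as one binary prediction and score only the
    # positive ("paired") class instead of overall accuracy.
    y_hat = (y_prob >= threshold).astype(float)
    tp = float(np.sum((y_hat == 1) & (y_true == 1)))
    fp = float(np.sum((y_hat == 1) & (y_true == 0)))
    fn = float(np.sum((y_hat == 0) & (y_true == 1)))
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    return precision, recall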