Download the DNA Library Designer (DNALD)


DNALD Tutorial (download)

// This is a single line comment. Everything up to the end of the line is 
// ignored. (This is another single line comment.)
# This is also a single line comment, if you prefer using a single hash.

/* This is a multi-line comment. Anything up to and including the closing 
 * symbol (asterisk and slash together) is ignored, which means that they can 
 * also be used to add comments within an interpreted line (see below). 
 * 
 * The preceding * are optional but inserted automatically for clarity.        
 */

/**
 * This is a documentation comment (or docstring), which is also a multi-line 
 * comment, distinguished by the second *, which will be associated with the 
 * following section.
 * 
 * Libraries are the main objects of DNALD. They allow you to specify a set of 
 * output DNA sequences, for synthesis, as combinations of existing ("natural") 
 * input DNA fragments and new ("synthetic") intermediate DNA fragments, using 
 * definitions and expressions of definitions, sequences and operations.
 *
 * This example demonstrates the structure and syntax of a DNALD DNA library 
 * design, the available operations and their syntax and semantics.
 *
 */
library tutorial {
/*
 * The symbolic keyword "library" MUST be followed by a name, which is used to 
 * create a namespace for the definitions within. The beginning and end of a 
 * library is denoted by an open curly brace and matching closing curly brace
 * after the final definition. 
 * 
 * DNALD names must match the following pattern: 
 * 	'^'?('a'..'z'|'A'..'Z'|'_'|"5'"|"3'") ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
 * which can be interpreted as: an optional hat character (used to distinguish 
 * names that are also keywords); followed by either a lower or upper case 
 * letter, an underscore or a 5' or 3' character pair; followed by zero or 
 * more alphanumerics and underscores.
 */
	
	/**
	 * The symbolic keyword "inputs" denotes the inputs section which again 
	 * begins and ends with open and closed curly braces respectively.
	 * 
	 * Within the inputs section definitions of sequences are specified, 
	 * corresponding to natural DNA fragments, that can be provided by you,
	 * the library designer, to the library manufacturer.
	 */
	inputs {
		input1 := 'acgt'

		/**
		 * Definitions begin with a name satisfying the rules above, followed by 
		 * the definition symbol := and a DNALD expression (examples of which are 
		 * shown below).
		 *  
		 * Sequences MUST be quoted using a pair of either 'single' or "double" 
		 * speech marks.
		 * Sequences can include numbers and whitespace (spaces, tabs or 
		 * end-of-line characters), which will be stripped automatically. This 
		 * enables cut and paste from the sequence sections of various formats. 
		 * Each letter in a sequence MUST be one of 'a', 'c', 'g' or 't'.	// NOT currently enforced 
		 * Lowercase and uppercase letters are allowed, as are a mix of cases.  
		 */
		input2 := 'atcgt'
		
	} # inputs end

	
		/**
		 * Intermediates are definitions in the library but NOT in the input
		 * or output sections.
		 */  
		
		# synthetic sequences
		5'end	:= 'atg'
		3'end	:= 'tta' # DNA letters only (a, c, g, t); 'u' is an RNA letter and is not a valid sequence letter

		# sequences (the results of expressions) are concatenated using only whitespace  	
		concatenation	:= 5'end input1 
		
		# the "is" keyword asserts that the expression following it is equivalent
		assertion		:= concatenation
						is 5'end 'acgt' # you can use assertions when refactoring 
						is 'atg' input1 # to check refactored expressions
						is 'atg' 'acgt' # against previous correct ones

						 
		reference	:= input1


		# subsequences
		# indices are 1-based and ranges include both endpoints, as the assertions below show
		nucleotide := input1[2] is 'c' # the second nucleotide of input1

		subsequence := input1[2:3] is 'cg' # nucleotides 2 to 3 of input1, inclusive
		
		from_i_to_end := input1[2:end] is 'cgt' # use end for the rest of the sequence 
		

		# insertions

		insertion_with_concatenation := input1[1] 'AAA' input1[4:end] # 'a' 'AAA' 't': nucleotides 2 to 3 ('cg') are replaced by 'AAA'

		insertion_with_mutation	:= input1[2 = 'tc'] # nucleotide 2 ('c') becomes 'tc', i.e. a 't' is inserted before it
								is 'atcgt'
								is input2
	
	
		# reverse, complement and their composition
	
		^reverse 	:= reverse('acca')
					is 'acca'
		
		^complement	:= complement('acca')
					is 'tggt'
					
		# the hat symbol ^ enables use of a keyword as a name
		
		/**
		 * reverse complements are special because they are the other strand 
		 * of their own reverse complement
		 */
		reverse_complement	:= reverse(complement('acca'))
							is complement(reverse('acca'))
							is 'tggt'
		
		# repetitions
		
		repetition1	:= 'acg' * 2 is 'acgacg'
		repetition2	:= 2 * 'acg' is 'acgacg'
		
		# the order of repetitions is irrelevant for single expressions 
		repetitions	:= 3 * 'ac' * 2
					is 2 * 'ac' * 3

		# the order of repetitions is relevant for multiple expressions
		repetitions_multiple
					:= 3 * ac GT * 2
					is 2 * ac GT * 3 # warning: presumably each count applies only to its adjacent fragment, so the two sides differ (compare the parenthesised form below) - TODO confirm

		# parentheses can be used to wrap multiple expressions into a single
		repetitions_multiple_parenthesised
					:= 3 * (ac GT) * 2
					is 2 * (ac GT) * 3

		# different coloured fragments used by repetitions_multiple* above
		ac	:= 'ac'
		GT	:= 'GT'

		variable_length_repetition := 'at' * 2:3 is 'atat' + 'atatat'
		

		# mutations, deletions and insertions		
	
		mutation 	:= input1[2:3='AA'] 	// an expression which replaces a mutated subsequence: 'cg' becomes 'AA', giving 'aAAt'
		
		deletion 	:= input1[2:3='A'] 		// when the mutation is shorter than the subsequence: 'cg' becomes 'A', giving 'aAt'
		
		insertion 	:= input1[2:3='AAA'] 	// when the mutation is longer than the subsequence: 'cg' becomes 'AAA', giving 'aAAAt'


		# multiple mutations

		SNP 	:= input1[4='A'] is 'acgA' 
		SNPs 	:= input1[2='',4='A'] is 'agA' // multiple mutations are applied simultaneously to preserve indices
		
		sequential_SNPs 	:= input1[2=''][3='A'] is 'agA'// OK but sequential mutations do not preserve indices 
//		sequential_SNPs2 	:= input1[2=''][4='A'] // not OK because sequential mutations do not preserve indices

		
		// creating sets of sequences

		// "+" or "union" creates a set of sequences   			
		union1	:= input1 + input2 is (input1 + input2) // sometimes parentheses are optional
				is input2 + input1 is (input2 + input1) // sometimes (with other set operations) they are not 
				is (input1 + input2) is (input2 + input1) // parentheses help delineate groups


		// a choice also creates a set but implies that only one sequence is required
		choice	:= input1 | input2 
				is input2 | input1

		
		// mutation with a set of sequences
		
		three_mutations	:= input1[2:3='AA' + 'A' + 'AAA'] 
						is mutation + deletion + insertion 


		// order of set operations

		union2	:= concatenation + input1  
				is input1 + concatenation // order is irrelevant
				is 'atgacgt' + 'acgt'
		
		intersection	:= union1 & union2 
						is union2 & union1 // order is irrelevant
						is input1
		
		symmetric_difference	:= union1 ^ union2
								is union2 ^ union1 // order is irrelevant
								is concatenation + input2   
								is 'atgacgt' + 'atcgt'

		difference1	:= union1 - union2
					is union2 - union1 // order is relevant: with the operands swapped the result is concatenation, not input2, so this assertion does not hold (NOTE(review): presumably deliberate, like the repetitions_multiple warning - confirm)
					is input2
					is 'atcgt'  
		 
		difference2	:= union2 - union1
					is union1 - union2 // order is relevant: with the operands swapped the result is input2, not concatenation, so this assertion does not hold (NOTE(review): presumably deliberate - confirm)
					is concatenation 
					is 'atg' 'acgt'
					is 'atgacgt'

	/**
	 * The outputs section is similar to the inputs section, except that
	 * at least one output must be defined. 
	 *
	 * These are the definitions that will be manufactured from the 
	 * library design.
	 */
	outputs {
		
		/**
		 * Now it's your turn. Any valid expression combining those definitions 
		 * above will appear at the bottom of the library view below.
		 */
		output1 := ''
					
	} # outputs end
}


/**
 * Codon tables are used to translate nucleotide sequences and back-translate 
 * amino acid sequences. 
 * 
 * The default codon table is the E. coli standard genetic code from
 * http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=37762&aa=1&style=N
 * based on 8087 CDS's (2,330,943 codons)
 */
Codon Table DefaultCodonTable {
	/* 
	 * A set of codons is specified by either the full name or three letter code 
	 * of an amino acid, followed by the definition symbol := and a series of 
	 * codons separated by the choice symbol |. 
	 * Codons MUST be specified for each of the 20 standard amino acids and the 
	 * untranslatable stop codons.
	 * Stop codons are named "stop" or "Ter". 
	 * Alternate start codons are named "start".
	 */ 
 	Alanine         /* A */ := 'GCA'  0.27 | 'GCC'  0.26 | 'GCG'  0.25 | 'GCT' #0.22
	Arginine        /* R */ := 'CGT'  0.30 | 'CGC'  0.26 | 'CGG'  0.15 | 'AGA'  0.13 | 'CGA' 0.09 | 'AGG' #0.07 
//  full name or three letter code           codon (quoted)       weight             choice               redundant weight when sum < 1
	Asn /* Asparagine, N */ := 'AAT'  0.59 | 'AAC' #0.41	#  
	Aspartic acid   /* D */ := 'GAT'  0.65 | 'GAC' #0.35
	Cysteine        /* C */ := 'TGT'  0.52 | 'TGC' #0.48
	Glutamic acid   /* E */ := 'GAA'  0.64 | 'GAG' #0.36
	Gln  /* Glutamine, Q */ := 'CAG'  0.65 | 'CAA' #0.35
	Glycine         /* G */ := 'GGT'  0.34 | 'GGC'  0.29 | 'GGA'  0.19 | 'GGG' #0.18
	Histidine       /* H */ := 'CAT'  0.63 | 'CAC'  0.37
	Ile             /* I */ := 'ATT'  0.47 | 'ATC'  0.31 | 'ATA'  0.21	# Isoleucine
	Leucine         /* L */ := 'CTG'  0.38 | 'TTA'  0.18 | 'CTT'  0.15 | 'TTG'  0.13 | 'CTC' 0.10 | 'CTA'  0.06
	Lysine          /* K */ := 'AAA'  0.71 | 'AAG' #0.29
	Methionine      /* M */ := 'ATG' #1.00
	Phenylalanine   /* F */ := 'TTT'  0.64 | 'TTC' #0.36
	Proline         /* P */ := 'CCG'  0.37 | 'CCT'  0.24 | 'CCA'  0.23 | 'CCC' #0.16
//	Pyl             /* O */ := '' // Pyrrolysine 
//	Sec             /* U */ := '' // Selenocysteine
	Serine          /* S */ := 'AGC'  0.20 | 'AGT'  0.18 | 'TCA'  0.18 | 'TCT'  0.18 | 'TCC' 0.14 | 'TCG' #0.11
	Ter /*stop*/    /* * */ := 'TAA'  0.58 | 'TGA'  0.33 | 'TAG' #0.09	# stop
	Threonine       /* T */ := 'ACC'  0.31 | 'ACA'  0.25 | 'ACG'       | 'ACT' #0.22 (each); the four threonine codons are ACC, ACA, ACG and ACT, and codons must be unique
	Trp             /* W */ := 'TGG' #1.00
	Tyrosine        /* Y */ := 'TAT'  0.65 | 'TAC' #0.35
	Valine          /* V */ := 'GTT'  0.32 | 'GTG'  0.29 | 'GTA'  0.19 | 'GTC' #0.19
	/*
	 * Numeric weights following the codon sequence are used to probabilistically 
	 * choose a given codon when back-translating. They correspond to the 
	 * determined (or desired) codon usage frequencies of the target organism. 
	 * Weights are optional if there are no weights given for that amino acid or if 
	 * the weights that are given sum to less than 1.
	 * The remainder (1 if no weights are given) is divided equally between the 
	 * unweighted codons. For example, in the default codon table the weights of 
	 * the last two codons of threonine can be commented out because their weights 
	 * are equal and the sum of those is equal to the remainder of 1 minus the sum 
	 * of the weights of the first two codons.  
	 * Weights may be greater than 1 and can sum to greater than 1, however, in 
	 * which case ALL of the codons must be given weights, because it is not 
	 * possible to determine the remainder otherwise.
	 * 
	 * Not every codon has to be used, but those that are must be unique. 
	 * Not using a particular codon is another way of biasing codon choice for 
	 * back-translated sequences. However, translation in DNALD also relies on 
	 * codon tables and an error will be flagged if a triplet in a reading frame 
	 * cannot be matched to an amino acid. Therefore it may be best to specify all 
	 * codons but give those that are unwanted a weight of 0. 
	 */
}