Warning: disgusting scripts ahead
# mount drive http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-using-volumes.html
# instructions
# http://trulymadlywordly.blogspot.com/2011/03/creating-text-corpus-from-wikipedia.html
# download http://source.cet.uct.ac.za/svn/people/smarquard/sphinx/experiments/scripts/Wikipedia2Txt.java
# remove the package declaration from the top of the file
# download http://code.google.com/p/gwtwiki/
# download http://commons.apache.org/compress/
# yum install java
# yum install java-devel
# setup maven https://github.com/dssg/cta-otp/wiki/AWS-EC2-Setup
# run "mvn compile" in the wiki directory
# page.isMain()
# Compile the converter against the bliki jar.
javac -classpath jars/bliki.jar Wikipedia2Txt.java
# Run it with bliki and commons-compress on the classpath; everything it
# prints to stdout is collected in /wiki2/sentences.
java -classpath .:./jars/bliki.jar:./jars/commons-compress-1.7.jar Wikipedia2Txt > /wiki2/sentences
# Decompress the Wikipedia dump in place (bunzip2 removes the .bz2 file).
bunzip2 enwiki-latest-pages-articles.xml.bz2
# Keep sentences that start with "You" (any case), minus those containing
# "Young". NOTE(review): this reads ./sentences, so it presumably has to be
# run from /wiki2 - confirm.
grep -i '^You' sentences | grep -v 'Young' > youze
#!/bin/bash
# Turn every line of the sentence list into its own MP3.
#
# Input:  youze2, one sentence per line.
# Output: sounds/<n>.mp3, where <n> is the 1-based line number.
# Needs:  text2wave and lame on PATH.
#
# A failure on one sentence is not fatal; the loop carries on with the next
# line, so a few bad sentences do not abort a long run.

# Print $1 with every character removed that is neither alphanumeric nor
# whitespace. Done with parameter expansion so the text is never exposed to
# word splitting or filename expansion (a "*" in a sentence must not turn
# into a list of file names).
sanitize_line() {
  printf '%s\n' "${1//[^[:alnum:][:space:]]/}"
}

file="youze2"
count=0

mkdir -p sounds

# IFS= and -r keep the line exactly as written; the "|| [[ -n ... ]]" part
# also picks up a final line that lacks a trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  count=$(( count + 1 ))
  sanitize_line "$line" | text2wave | lame --silent - > "sounds/$count.mp3"
  echo "line $count"
done < "$file"
#!/bin/bash
# Add a convolution reverb to every MP3 in sounds/ and write the result,
# under the same file name, to echos/.
#
# Needs: sox, fconvolver and lame on PATH, plus chapel.conf in the current
# directory (passed to fconvolver as its configuration).
#
# To try the pipeline on a sample only, add a counter to the loop at the
# bottom and `break` after the first few files.

padding=3                       # seconds of silence appended to each clip
tmp=echos/tmp.wav               # raw fconvolver output
trimmed=echos/tmp_trimmed.wav   # convolver output cut to the clip length
final=echos/tmp_final.wav       # dry + wet mix, before MP3 encoding

#######################################
# Process one clip.
# Globals:   padding, tmp, trimmed, final (read)
# Arguments: $1 - path of the source MP3 (expected to end in .mp3)
# Outputs:   echos/<basename of $1>; progress on stdout, warnings on stderr
# Returns:   0 normally, 1 if the clip length could not be determined
#######################################
process_clip() {
  local f=$1
  local filename=${f##*/}
  local base=${f%.mp3}
  local wav=$base.wav
  local wav_regular=$base.regular.wav
  local duration
  local status=0

  echo "Processing $f file..."

  # Two padded decodes of the source: a -20 dB copy that feeds the
  # convolver, and a 2-channel full-volume copy for the final mix.
  sox "$f" "$wav" vol -20dB pad 0 "$padding"
  sox "$f" -c 2 "$wav_regular" pad 0 "$padding"

  # Length of the padded clip, parsed from the report of sox's stat effect
  # (which sox prints on stderr, hence 2>&1).
  duration=$(sox "$wav" -n stat 2>&1 \
    | sed -n 's#^Length (seconds):[^0-9]*\([0-9.]*\)$#\1#p')

  if [[ -n "$duration" ]]; then
    fconvolver chapel.conf "$wav" "$tmp"
    # Cut the convolver output back to the padded clip length.
    sox "$tmp" "$trimmed" trim 0 "$duration"
    # Mix dry and wet signals; linear fade-out over the last 0.02 s.
    sox -m "$wav_regular" "$trimmed" "$final" fade t 0 "$duration" 0.02
    lame --quiet "$final" "echos/$filename"
  else
    # Without a length the trim/fade arguments would be malformed
    # ("fade t 0 0.02" stops the audio after 0.02 s), so skip the clip.
    echo "Skipping $f: could not determine its duration" >&2
    status=1
  fi

  rm -f -- "$wav" "$tmp" "$trimmed" "$wav_regular" "$final"
  return "$status"
}

mkdir -p echos

for f in sounds/*.mp3; do
  # An unmatched glob stays literal; do not treat it as a file name.
  [[ -e "$f" ]] || continue
  process_clip "$f"
done